Index: lib/Target/AMDGPU/AMDGPUCallingConv.td =================================================================== --- lib/Target/AMDGPU/AMDGPUCallingConv.td +++ lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -127,7 +127,7 @@ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>, + CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>, CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, @@ -144,7 +144,7 @@ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">> + CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">> ]>; def CC_AMDGPU : CallingConv<[ Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -73,7 +73,9 @@ case MVT::i64: case MVT::f64: case MVT::v2i32: - case MVT::v2f32: { + case MVT::v2f32: + case MVT::v4i16: + case MVT::v4f16: { // Up to SGPR0-SGPR39 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, &AMDGPU::SGPR_64RegClass, 20); @@ -94,7 +96,9 @@ case MVT::i64: case MVT::f64: case MVT::v2i32: - case MVT::v2f32: { + case MVT::v2f32: + case MVT::v4i16: + case MVT::v4f16: { return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, &AMDGPU::VReg_64RegClass, 31); } @@ -1295,6 +1299,16 @@ SelectionDAG &DAG) const { SmallVector Args; + EVT VT = Op.getValueType(); + if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SDLoc SL(Op); + SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0)); + SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1)); + + SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi }); + return DAG.getNode(ISD::BITCAST, SL, VT, BV); + } + for (const SDUse &U : Op->ops()) DAG.ExtractVectorElements(U.get(), Args); Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -1068,8 +1068,7 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; - defm : MUBUF_LoadIntrinsicPat; - defm : MUBUF_LoadIntrinsicPat; + defm : MUBUF_LoadIntrinsicPat; } // End HasPackedD16VMem. defm : MUBUF_LoadIntrinsicPat; @@ -1129,8 +1128,7 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; - defm : MUBUF_StoreIntrinsicPat; - defm : MUBUF_StoreIntrinsicPat; + defm : MUBUF_StoreIntrinsicPat; } // End HasPackedD16VMem. 
defm : MUBUF_StoreIntrinsicPat; @@ -1555,8 +1553,7 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_LoadIntrinsicPat; defm : MTBUF_LoadIntrinsicPat; - defm : MTBUF_LoadIntrinsicPat; - defm : MTBUF_LoadIntrinsicPat; + defm : MTBUF_LoadIntrinsicPat; } // End HasPackedD16VMem. multiclass MTBUF_StoreIntrinsicPat; defm : MTBUF_StoreIntrinsicPat; - defm : MTBUF_StoreIntrinsicPat; - defm : MTBUF_StoreIntrinsicPat; + defm : MTBUF_StoreIntrinsicPat; } // End HasPackedD16VMem. //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/MIMGInstructions.td =================================================================== --- lib/Target/AMDGPU/MIMGInstructions.td +++ lib/Target/AMDGPU/MIMGInstructions.td @@ -594,12 +594,6 @@ def intr#_pat_v4 : ImageDimPattern; } -// v2f16 and v4f16 are used as data types to signal that D16 should be used. -// However, they are not (always) legal types, and the SelectionDAG requires us -// to legalize them before running any patterns. So we legalize them by -// converting to an int type of equal size and using an internal 'd16helper' -// intrinsic instead which signifies both the use of D16 and actually allows -// this integer-based return type. multiclass ImageDimD16Helper { let SubtargetPredicate = HasUnpackedD16VMem in { @@ -611,7 +605,7 @@ let SubtargetPredicate = HasPackedD16VMem in { def _packed_v1 : ImageDimPattern; def _packed_v2 : ImageDimPattern; - def _packed_v4 : ImageDimPattern; + def _packed_v4 : ImageDimPattern; } // End HasPackedD16VMem. } @@ -653,10 +647,7 @@ } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem in { - def intr#_packed_v4 : - ImageDimPattern( - "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name), - "_V2", v2i32, "_D16">; + def intr#_packed_v4 : ImageDimPattern; } // End HasPackedD16VMem. } @@ -703,6 +694,7 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : ImageSampleDataPatterns(opcode # _V1), f16, "_D16">; defm : ImageSampleDataPatterns(opcode # _V1), v2f16, "_D16">; + defm : ImageSampleDataPatterns(opcode # _V2), v4f16, "_D16">; } // End HasPackedD16VMem. } @@ -712,16 +704,15 @@ defm : ImageSampleDataPatterns(opcode # _V2), v2i32, "_D16_gfx80">; defm : ImageSampleDataPatterns(opcode # _V4), v4i32, "_D16_gfx80">; } // End HasUnpackedD16VMem. - - let SubtargetPredicate = HasPackedD16VMem in { - defm : ImageSampleDataPatterns(opcode # _V1), f16, "_D16">; - defm : ImageSampleDataPatterns(opcode # _V2), v2i32, "_D16">; - } // End HasPackedD16VMem. } // ImageGather4 patterns. multiclass ImageGather4Patterns { defm : ImageSampleDataPatterns(opcode # _V4), v4f32>; + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageSampleDataPatterns(opcode # _V2), v4f16, "_D16">; + } // End HasPackedD16VMem. } // ImageGather4 alternative patterns for illegal vector half Types. @@ -730,9 +721,6 @@ defm : ImageSampleDataPatterns(opcode # _V4), v4i32, "_D16_gfx80">; } // End HasUnpackedD16VMem. - let SubtargetPredicate = HasPackedD16VMem in { - defm : ImageSampleDataPatterns(opcode # _V2), v2i32, "_D16">; - } // End HasPackedD16VMem. } // ImageLoad for amdgcn. @@ -766,6 +754,7 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : ImageLoadDataPatterns(opcode # _V1), f16, "_D16">; defm : ImageLoadDataPatterns(opcode # _V1), v2f16, "_D16">; + defm : ImageLoadDataPatterns(opcode # _V2), v4f16, "_D16">; } // End HasPackedD16VMem. 
} @@ -775,11 +764,6 @@ defm : ImageLoadDataPatterns(opcode # _V2), v2i32, "_D16_gfx80">; defm : ImageLoadDataPatterns(opcode # _V4), v4i32, "_D16_gfx80">; } // End HasUnPackedD16VMem. - - let SubtargetPredicate = HasPackedD16VMem in { - defm : ImageLoadDataPatterns(opcode # _V1), f16, "_D16">; - defm : ImageLoadDataPatterns(opcode # _V2), v2i32, "_D16">; - } // End HasPackedD16VMem. } // ImageStore for amdgcn. @@ -813,6 +797,7 @@ let SubtargetPredicate = HasPackedD16VMem in { defm : ImageStoreDataPatterns(opcode # _V1), f16, "_D16">; defm : ImageStoreDataPatterns(opcode # _V1), v2f16, "_D16">; + defm : ImageStoreDataPatterns(opcode # _V2), v4f16, "_D16">; } // End HasPackedD16VMem. } Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -268,7 +268,10 @@ EVT VT) const override; MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const; + SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -143,6 +143,8 @@ // Unless there are also VOP3P operations, not operations are really legal. addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v4i16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v4f16, &AMDGPU::SReg_32_XM0RegClass); } computeRegisterProperties(STI.getRegisterInfo()); @@ -237,7 +239,7 @@ // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, - MVT::v2i64, MVT::v2f64}) { + MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -260,6 +262,8 @@ } } + setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand); + // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that // is expanded to avoid having two separate loops in case the index is a VGPR. 
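The hunks above and below make v4i16 and v4f16 legal types carried in 64-bit register classes, so a <4 x half> or <4 x i16> kernel argument can be fetched with a single s_load_dwordx2 instead of two s_load_dword (see the half.ll and kernel-args.ll test updates later in this patch). A minimal IR sketch of that case, modeled on the existing load_v4f16_arg test; the function name is illustrative, not part of the patch:

define amdgpu_kernel void @store_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) {
  ; With v4f16 legal, %arg is expected to arrive as one 64-bit SGPR pair
  ; (read via s_load_dwordx2 on VI/GFX9) and be stored with a single
  ; dwordx2 store rather than piecewise 16-bit accesses.
  store <4 x half> %arg, <4 x half> addrspace(1)* %out
  ret void
}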
@@ -426,7 +430,7 @@ if (!Subtarget->hasFP16Denormals()) setOperationAction(ISD::FMAD, MVT::f16, Legal); - for (MVT VT : {MVT::v2i16, MVT::v2f16}) { + for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -488,6 +492,10 @@ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand); + if (!Subtarget->hasVOP3PInsts()) { setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); @@ -520,8 +528,31 @@ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + + setOperationAction(ISD::SHL, MVT::v4i16, Custom); + setOperationAction(ISD::SRA, MVT::v4i16, Custom); + setOperationAction(ISD::SRL, MVT::v4i16, Custom); + setOperationAction(ISD::ADD, MVT::v4i16, Custom); + setOperationAction(ISD::SUB, MVT::v4i16, Custom); + setOperationAction(ISD::MUL, MVT::v4i16, Custom); + + setOperationAction(ISD::SMIN, MVT::v4i16, Custom); + setOperationAction(ISD::SMAX, MVT::v4i16, Custom); + setOperationAction(ISD::UMIN, MVT::v4i16, Custom); + setOperationAction(ISD::UMAX, MVT::v4i16, Custom); + + setOperationAction(ISD::FADD, MVT::v4f16, Custom); + setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); + setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); + + setOperationAction(ISD::SELECT, MVT::v4i16, Custom); + setOperationAction(ISD::SELECT, MVT::v4f16, Custom); } + setOperationAction(ISD::FNEG, MVT::v4f16, Custom); + setOperationAction(ISD::FABS, MVT::v4f16, Custom); + if (Subtarget->has16BitInsts()) { setOperationAction(ISD::SELECT, MVT::v2i16, Promote); AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); @@ -3383,6 +3414,49 @@ // Custom DAG Lowering Operations //===----------------------------------------------------------------------===// +// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the +// wider vector type is legal. +SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4f16); + + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); + + SDLoc SL(Op); + SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + +// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the +// wider vector type is legal. 
+SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4i16 || VT == MVT::v4f16); + + SDValue Lo0, Hi0; + std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); + SDValue Lo1, Hi1; + std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); + + SDLoc SL(Op); + + SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -3423,6 +3497,24 @@ return lowerTRAP(Op, DAG); case ISD::DEBUGTRAP: return lowerDEBUGTRAP(Op, DAG); + case ISD::FABS: + case ISD::FNEG: + return splitUnaryVectorOp(Op, DAG); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FADD: + case ISD::FMUL: + return splitBinaryVectorOp(Op, DAG); } return SDValue(); } @@ -3630,21 +3722,23 @@ bool Unpacked = Subtarget->hasUnpackedD16VMem(); EVT LoadVT = M->getValueType(0); - EVT UnpackedLoadVT = LoadVT.isVector() ? - EVT::getVectorVT(*DAG.getContext(), MVT::i32, - LoadVT.getVectorNumElements()) : LoadVT; EVT EquivLoadVT = LoadVT; - if (LoadVT.isVector()) { - EquivLoadVT = Unpacked ? UnpackedLoadVT : - getEquivalentMemType(*DAG.getContext(), LoadVT); + if (Unpacked && LoadVT.isVector()) { + EquivLoadVT = LoadVT.isVector() ? + EVT::getVectorVT(*DAG.getContext(), MVT::i32, + LoadVT.getVectorNumElements()) : LoadVT; } // Change from v4f16/v2f16 to EquivLoadVT. SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); - SDValue Load = DAG.getMemIntrinsicNode( - IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, - DL, VTList, Ops, M->getMemoryVT(), M->getMemOperand()); + SDValue Load + = DAG.getMemIntrinsicNode( + IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, + VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + if (!Unpacked) // Just adjusted the opcode. + return Load; SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); @@ -3734,8 +3828,10 @@ return; } case ISD::FNEG: { + if (N->getValueType(0) != MVT::v2f16) + break; + SDLoc SL(N); - assert(N->getValueType(0) == MVT::v2f16); SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, @@ -3745,8 +3841,10 @@ return; } case ISD::FABS: { + if (N->getValueType(0) != MVT::v2f16) + break; + SDLoc SL(N); - assert(N->getValueType(0) == MVT::v2f16); SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, @@ -4247,6 +4345,23 @@ SDLoc SL(Op); EVT VT = Op.getValueType(); + if (VT == MVT::v4i16 || VT == MVT::v4f16) { + EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); + + // Turn into pair of packed build_vectors. + // TODO: Special case for constants that can be materialized with s_mov_b64. 
+ SDValue Lo = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(0), Op.getOperand(1) }); + SDValue Hi = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(2), Op.getOperand(3) }); + + SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo); + SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi); + + SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi }); + return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + } + assert(VT == MVT::v2f16 || VT == MVT::v2i16); SDValue Lo = Op.getOperand(0); @@ -4913,11 +5028,11 @@ case Intrinsic::amdgcn_image_load: case Intrinsic::amdgcn_image_load_mip: { - EVT LoadVT = Op.getValueType(); - if ((Subtarget->hasUnpackedD16VMem() && LoadVT == MVT::v2f16) || - LoadVT == MVT::v4f16) { - MemSDNode *M = cast(Op); - return adjustLoadValueType(getImageOpcode(IntrID), M, DAG); + EVT VT = Op.getValueType(); + if (Subtarget->hasUnpackedD16VMem() && + VT.isVector() && VT.getScalarSizeInBits() == 16) { + return adjustLoadValueType(getImageOpcode(IntrID), cast(Op), + DAG); } return SDValue(); @@ -5009,8 +5124,9 @@ return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op)); } - if ((Subtarget->hasUnpackedD16VMem() && Op.getValueType() == MVT::v2f16) || - Op.getValueType() == MVT::v4f16) { + if (Subtarget->hasUnpackedD16VMem() && + Op.getValueType().isVector() && + Op.getValueType().getScalarSizeInBits() == 16) { return adjustLoadValueType(getImageOpcode(IntrID), cast(Op), DAG); } @@ -5018,21 +5134,14 @@ return SDValue(); } default: - EVT LoadVT = Op.getValueType(); - if (LoadVT.getScalarSizeInBits() != 16) - return SDValue(); - - const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr = - AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID); - if (D16ImageDimIntr) { - bool Unpacked = Subtarget->hasUnpackedD16VMem(); - MemSDNode *M = cast(Op); - - if (isTypeLegal(LoadVT) && (!Unpacked || LoadVT == MVT::f16)) - return SDValue(); - - return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr, - M, DAG, true); + if (Subtarget->hasUnpackedD16VMem() && + Op.getValueType().isVector() && + Op.getValueType().getScalarSizeInBits() == 16) { + if (const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr = + AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID)) { + return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr, + cast(Op), DAG, true); + } } return SDValue(); @@ -5061,13 +5170,8 @@ return DAG.UnrollVectorOp(ZExt.getNode()); } - if (isTypeLegal(StoreVT)) - return VData; - - // If target supports packed vmem, we just need to workaround - // the illegal type by casting to an equivalent one. 
- EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT); - return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData); + assert(isTypeLegal(StoreVT)); + return VData; } SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, @@ -5261,9 +5365,9 @@ case Intrinsic::amdgcn_image_store: case Intrinsic::amdgcn_image_store_mip: { SDValue VData = Op.getOperand(2); - if ((Subtarget->hasUnpackedD16VMem() && - VData.getValueType() == MVT::v2f16) || - VData.getValueType() == MVT::v4f16) { + EVT VT = VData.getValueType(); + if (Subtarget->hasUnpackedD16VMem() && + VT.isVector() && VT.getScalarSizeInBits() == 16) { SDValue Chain = Op.getOperand(0); VData = handleD16VData(VData, DAG); @@ -5293,9 +5397,9 @@ if (D16ImageDimIntr) { SDValue VData = Op.getOperand(2); EVT StoreVT = VData.getValueType(); - if (((StoreVT == MVT::v2f16 || StoreVT == MVT::v4f16) && - Subtarget->hasUnpackedD16VMem()) || - !isTypeLegal(StoreVT)) { + if (Subtarget->hasUnpackedD16VMem() && + StoreVT.isVector() && + StoreVT.getScalarSizeInBits() == 16) { SmallVector Ops(Op.getNode()->op_values()); Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32); @@ -5521,8 +5625,8 @@ } SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() != MVT::i64) - return SDValue(); + EVT VT = Op.getValueType(); + assert(VT.getSizeInBits() == 64); SDLoc DL(Op); SDValue Cond = Op.getOperand(0); @@ -5544,7 +5648,7 @@ SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); - return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); + return DAG.getNode(ISD::BITCAST, DL, VT, Res); } // Catch division cases where we can use shortcuts with rcp and rsq Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -795,6 +795,27 @@ >; } + +def : Pat < + (extract_subvector v4i16:$vec, (i32 0)), + (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0)) +>; + +def : Pat < + (extract_subvector v4i16:$vec, (i32 2)), + (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1)) +>; + +def : Pat < + (extract_subvector v4f16:$vec, (i32 0)), + (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0)) +>; + +def : Pat < + (extract_subvector v4f16:$vec, (i32 2)), + (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) +>; + let SubtargetPredicate = isGCN in { // FIXME: Why do only some of these type combinations for SReg and @@ -834,6 +855,26 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; + +// FIXME: Make SGPR +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + def : BitConvert ; def : BitConvert ; Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -444,13 +444,13 @@ let isAllocatable = 0; } -def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, +def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 8; } -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, +def SReg_64 : 
RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 8; @@ -505,7 +505,7 @@ } // Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> { let Size = 64; // Requires 2 v_mov_b32 to copy Index: test/CodeGen/AMDGPU/amdgcn.bitcast.ll =================================================================== --- test/CodeGen/AMDGPU/amdgcn.bitcast.ll +++ test/CodeGen/AMDGPU/amdgcn.bitcast.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; This test just checks that the compiler doesn't crash. @@ -126,3 +127,163 @@ store <2 x i64> %phi, <2 x i64> addrspace(1)* %out ret void } + +; FUNC-LABEL: {{^}}v4i16_to_f64: +define amdgpu_kernel void @v4i16_to_f64(double addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4 + %add.v4i16 = add <4 x i16> %load, + %bc = bitcast <4 x i16> %add.v4i16 to double + %fadd.bitcast = fadd double %bc, 1.0 + store double %fadd.bitcast, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v4f16_to_f64: +define amdgpu_kernel void @v4f16_to_f64(double addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind { + %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4 + %add.v4half = fadd <4 x half> %load, + %bc = bitcast <4 x half> %add.v4half to double + %fadd.bitcast = fadd double %bc, 1.0 + store double %fadd.bitcast, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_to_v4f16: +define amdgpu_kernel void @f64_to_v4f16(<4 x half> addrspace(1)* %out, double addrspace(1)* %in) nounwind { + %load = load double, double addrspace(1)* %in, align 4 + %fadd32 = fadd double %load, 1.0 + %bc = bitcast double %fadd32 to <4 x half> + %add.bitcast = fadd <4 x half> %bc, + store <4 x half> %add.bitcast, <4 x half> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_to_v4i16: +define amdgpu_kernel void @f64_to_v4i16(<4 x i16> addrspace(1)* %out, double addrspace(1)* %in) nounwind { + %load = load double, double addrspace(1)* %in, align 4 + %fadd32 = fadd double %load, 1.0 + %bc = bitcast double %fadd32 to <4 x i16> + %add.bitcast = add <4 x i16> %bc, + store <4 x i16> %add.bitcast, <4 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v4i16_to_i64: +define amdgpu_kernel void @v4i16_to_i64(i64 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4 + %add.v4i16 = add <4 x i16> %load, + %bc = bitcast <4 x i16> %add.v4i16 to i64 + %add.bitcast = add i64 %bc, 1 + store i64 %add.bitcast, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v4f16_to_i64: +define amdgpu_kernel void @v4f16_to_i64(i64 addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind { + %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4 + %add.v4half = fadd <4 x half> %load, + %bc = bitcast <4 x half> %add.v4half to i64 + %add.bitcast = add i64 %bc, 1 + store i64 %add.bitcast, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: 
{{^}}bitcast_i64_to_v4i16: +define amdgpu_kernel void @bitcast_i64_to_v4i16(<4 x i16> addrspace(1)* %out, i64 addrspace(1)* %in) { + %val = load i64, i64 addrspace(1)* %in, align 8 + %add = add i64 %val, 4 + %bc = bitcast i64 %add to <4 x i16> + %add.v4i16 = add <4 x i16> %bc, + store <4 x i16> %add.v4i16, <4 x i16> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}bitcast_i64_to_v4f16: +define amdgpu_kernel void @bitcast_i64_to_v4f16(<4 x half> addrspace(1)* %out, i64 addrspace(1)* %in) { + %val = load i64, i64 addrspace(1)* %in, align 8 + %add = add i64 %val, 4 + %bc = bitcast i64 %add to <4 x half> + %add.v4i16 = fadd <4 x half> %bc, + store <4 x half> %add.v4i16, <4 x half> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v4i16_to_v2f32: +define amdgpu_kernel void @v4i16_to_v2f32(<2 x float> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4 + %add.v4i16 = add <4 x i16> %load, + %bc = bitcast <4 x i16> %add.v4i16 to <2 x float> + %fadd.bitcast = fadd <2 x float> %bc, + store <2 x float> %fadd.bitcast, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v4f16_to_v2f32: +define amdgpu_kernel void @v4f16_to_v2f32(<2 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind { + %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4 + %add.v4half = fadd <4 x half> %load, + %bc = bitcast <4 x half> %add.v4half to <2 x float> + %fadd.bitcast = fadd <2 x float> %bc, + store <2 x float> %fadd.bitcast, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v2f32_to_v4i16: +define amdgpu_kernel void @v2f32_to_v4i16(<4 x i16> addrspace(1)* %out, <2 x float> addrspace(1)* %in) nounwind { + %load = load <2 x float>, <2 x float> addrspace(1)* %in, align 4 + %add.v2f32 = fadd <2 x float> %load, + %bc = bitcast <2 x float> %add.v2f32 to <4 x i16> + %add.bitcast = add <4 x i16> %bc, + store <4 x i16> %add.bitcast, <4 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v2f32_to_v4f16: +define amdgpu_kernel void @v2f32_to_v4f16(<4 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) nounwind { + %load = load <2 x float>, <2 x float> addrspace(1)* %in, align 4 + %add.v2f32 = fadd <2 x float> %load, + %bc = bitcast <2 x float> %add.v2f32 to <4 x half> + %add.bitcast = fadd <4 x half> %bc, + store <4 x half> %add.bitcast, <4 x half> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v4i16_to_v2i32: +define amdgpu_kernel void @v4i16_to_v2i32(<2 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4 + %add.v4i16 = add <4 x i16> %load, + %bc = bitcast <4 x i16> %add.v4i16 to <2 x i32> + %add.bitcast = add <2 x i32> %bc, + store <2 x i32> %add.bitcast, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v4f16_to_v2i32: +define amdgpu_kernel void @v4f16_to_v2i32(<2 x i32> addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind { + %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4 + %add.v4half = fadd <4 x half> %load, + %bc = bitcast <4 x half> %add.v4half to <2 x i32> + %add.bitcast = add <2 x i32> %bc, + store <2 x i32> %add.bitcast, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v2i32_to_v4i16: +define amdgpu_kernel void @v2i32_to_v4i16(<4 x i16> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { + %load = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 4 + %add.v2i32 = add <2 x i32> %load, + %bc = bitcast <2 x i32> 
%add.v2i32 to <4 x i16> + %add.bitcast = add <4 x i16> %bc, + store <4 x i16> %add.bitcast, <4 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v2i32_to_v4f16: +define amdgpu_kernel void @v2i32_to_v4f16(<4 x half> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { + %load = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 4 + %add.v2i32 = add <2 x i32> %load, + %bc = bitcast <2 x i32> %add.v2i32 to <4 x half> + %add.bitcast = fadd <4 x half> %bc, + store <4 x half> %add.bitcast, <4 x half> addrspace(1)* %out + ret void +} Index: test/CodeGen/AMDGPU/extract_vector_elt-f16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -58,8 +58,14 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v3f16: -; GCN: s_load_dword s -; GCN: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s + +; GFX89: s_load_dwordx2 +; GFX89: s_load_dwordx2 + +; GCN: buffer_store_short +; GCN: buffer_store_short define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 { %p0 = extractelement <3 x half> %foo, i32 0 %p1 = extractelement <3 x half> %foo, i32 2 @@ -71,12 +77,14 @@ ; FIXME: Why sometimes vector shift? ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16: -; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s + +; GFX89: s_load_dwordx2 s +; GFX89: s_load_dwordx2 s +; GFX89: s_load_dword s -; GFX9-DAG: global_load_short_d16_hi v -; GFX9-DAG: global_load_short_d16 v ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 ; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v Index: test/CodeGen/AMDGPU/extract_vector_elt-i16.ll =================================================================== --- test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -58,8 +58,15 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v3i16: -; GCN: s_load_dword s -; GCN: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dwordx2 s +; SI: s_load_dword s + +; GFX89: s_load_dwordx2 +; GFX89: s_load_dwordx2 + +; GCN-NOT: {{buffer|flat|global}}_load + ; GCN: buffer_store_short ; GCN: buffer_store_short define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 { @@ -77,17 +84,11 @@ ; SI: buffer_store_short ; SI: buffer_store_short -; VI: s_load_dword s -; VI: s_load_dword s -; VI: buffer_store_short -; VI: buffer_store_short - -; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c -; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30 -; GFX9-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], [[LOAD0]] -; GFX9-DAG: buffer_store_short [[VLOAD0]], off -; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]] -; GFX9-DAG: buffer_store_short [[VLOAD1]], off +; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x2c +; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[LOAD0]] +; GFX89-DAG: buffer_store_short [[VLOAD0]], off +; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[LOAD1]] +; GFX89-DAG: buffer_store_short [[VLOAD1]], off define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 { %p0 = extractelement <4 x i16> %foo, i32 0 %p1 = extractelement <4 x i16> %foo, i32 2 @@ -98,19 +99,28 @@ } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16: -; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; 
SI: s_load_dword s + +; GFX89-DAG: s_load_dwordx2 +; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x2c +; GFX89-DAG: s_load_dword s + ; GCN-NOT: {{buffer|flat|global}} -; FIXME: Unnecessary repacking -; GFX9: s_pack_ll_b32_b16 -; GFX9: s_pack_lh_b32_b16 +; SICI: buffer_store_short +; SICI: buffer_store_short +; SICI: buffer_store_short -; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 +; SICI: buffer_load_ushort +; SICI: buffer_store_short +; GFX9-NOT: s_pack_ll_b32_b16 +; GFX9-NOT: s_pack_lh_b32_b16 -; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s +; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 +; GFX89: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LOAD0]]:[[LOAD1]]{{\]}}, s{{[0-9]+}} ; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { Index: test/CodeGen/AMDGPU/fabs.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fabs.f16.ll +++ test/CodeGen/AMDGPU/fabs.f16.ll @@ -39,11 +39,12 @@ } ; GCN-LABEL: {{^}}s_fabs_v4f16: -; GCN: s_load_dword s -; GCN: s_load_dword s +; CI: s_load_dword s[[LO:[0-9]+]] +; CI: s_load_dword s[[HI:[0-9]+]] +; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] +; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]] +; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]] ; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) Index: test/CodeGen/AMDGPU/function-args.ll =================================================================== --- test/CodeGen/AMDGPU/function-args.ll +++ test/CodeGen/AMDGPU/function-args.ll @@ -297,8 +297,8 @@ } ; GCN-LABEL: {{^}}void_func_v3i16: -; GCN-DAG: buffer_store_dword v0, off -; GCN-DAG: buffer_store_short v2, off +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off +; GCN-DAG: buffer_store_short v{{[0-9]+}}, off define void @void_func_v3i16(<3 x i16> %arg0) #0 { store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef ret void @@ -434,10 +434,17 @@ ret void } +; FIXME: Different abi if f16 legal ; GCN-LABEL: {{^}}void_func_v3f16: -; GFX9-NOT: v0 -; GCN-DAG: buffer_store_dword v0, off -; GCN-DAG: buffer_store_short v2, off +; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v0 +; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v1 +; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v2 + +; GFX89-DAG: v0 +; GFX89-DAG: v1 + +; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_dword define void @void_func_v3f16(<3 x half> %arg0) #0 { store <3 x half> %arg0, <3 x half> addrspace(1)* undef ret void Index: test/CodeGen/AMDGPU/function-returns.ll =================================================================== --- test/CodeGen/AMDGPU/function-returns.ll +++ test/CodeGen/AMDGPU/function-returns.ll @@ -283,9 +283,8 @@ ; GCN-LABEL: {{^}}v3i16_func_void: ; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off -; GFX9: s_waitcnt vmcnt(0) -; GFX9: v_lshrrev_b32 -; GFX9: s_setpc_b64 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 define <3 x i16> @v3i16_func_void() #0 { %val = load <3 x i16>, <3 x i16> addrspace(1)* undef ret <3 x i16> %val Index: test/CodeGen/AMDGPU/half.ll =================================================================== --- 
test/CodeGen/AMDGPU/half.ll +++ test/CodeGen/AMDGPU/half.ll @@ -22,14 +22,20 @@ } ; GCN-LABEL: {{^}}load_v3f16_arg: -; GCN: s_load_dword s -; GCN: s_load_dword s +; SI: s_load_dwordx2 +; SI: s_load_dword s +; SI: s_load_dword s + +; VI: s_load_dwordx2 +; VI: s_load_dwordx2 + ; GCN-NOT: {buffer|flat|global}}_load_ -; GCN-NOT: _load -; GCN-DAG: _store_dword -; GCN-DAG: _store_short -; GCN-NOT: _store + +; GCN-NOT: {{flat|global}}_load +; GCN-DAG: {{flat|global}}_store_dword +; GCN-DAG: {{flat|global}}_store_short +; GCN-NOT: {{flat|global}}_store ; GCN: s_endpgm define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { store <3 x half> %arg, <3 x half> addrspace(1)* %out @@ -39,10 +45,13 @@ ; FIXME: Why not one load? ; GCN-LABEL: {{^}}load_v4f16_arg: -; GCN-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x2|0x8}} -; GCN-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x3|0xc}} -; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]] -; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]] +; SI-DAG: s_load_dword s[[ARG0_LO:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2 +; SI-DAG: s_load_dword s[[ARG0_HI:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x3 + +; VI: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 + +; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], s[[ARG0_LO]] +; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], s[[ARG0_HI]] ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}} define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { store <4 x half> %arg, <4 x half> addrspace(1)* %out @@ -77,8 +86,14 @@ } ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: -; GCN: s_load_dword s -; GCN: s_load_dword s +; SI: s_load_dwordx2 s +; SI: s_load_dword s +; SI: s_load_dword s + +; VI: s_load_dwordx2 +; VI: s_load_dwordx2 +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 + ; GCN-NOT: _load ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -101,10 +116,14 @@ } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: -; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN: s_load_dword s -; GCN: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s + +; VI: s_load_dwordx2 s +; VI: s_load_dwordx2 s +; VI: s_load_dwordx2 s ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -150,8 +169,12 @@ } ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: -; GCN: s_load_dword -; GCN: s_load_dword +; SI: s_load_dword +; SI: s_load_dword + +; VI: s_load_dwordx2 +; VI: s_load_dwordx2 + ; GCN: s_lshr_b32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -168,8 +191,10 @@ } ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: -; GCN: s_load_dword s -; GCN: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s + +; VI: s_load_dwordx2 s ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -187,11 +212,14 @@ } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: -; GCN: s_load_dword s -; GCN-NEXT: s_load_dword s -; GCN-NEXT: s_load_dword s -; GCN-NEXT: s_load_dword s -; GCN-NOT: _load_ +; SI: s_load_dword s +; SI-NEXT: s_load_dword s +; SI-NEXT: s_load_dword s +; SI-NEXT: s_load_dword s +; SI-NOT: _load_ + +; VI: s_load_dwordx2 s +; VI: s_load_dwordx2 s ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 Index: test/CodeGen/AMDGPU/kernel-args.ll =================================================================== --- test/CodeGen/AMDGPU/kernel-args.ll +++ test/CodeGen/AMDGPU/kernel-args.ll @@ -226,8 +226,11 @@ ; EG-DAG: 
VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 -; GCN: s_load_dword s -; GCN: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s + +; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 +; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { entry: store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 @@ -291,11 +294,8 @@ ; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9 -; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x30 - -; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x8 -; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc +; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x2c +; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { entry: store <4 x i16> %in, <4 x i16> addrspace(1)* %out @@ -391,11 +391,11 @@ ; SI-NOT: {{buffer|flat|global}}_load -; VI: s_load_dwordx2 -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s +; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34 +; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x3c + +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x18 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { entry: store <8 x i16> %in, <8 x i16> addrspace(1)* %out @@ -528,14 +528,15 @@ ; SI-NOT: {{buffer|flat|global}}_load -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s +; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 +; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x4c +; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x54 +; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x5c + +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x28 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x38 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { entry: store <16 x i16> %in, <16 x i16> addrspace(1)* %out Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -29,27 +29,21 @@ } ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: - -; UNPACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x14 +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} -; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 
[[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] ; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen - -; PACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; PACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x14 - -; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]] -; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]] +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] ; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll @@ -66,9 +66,6 @@ ; UNPACKED-DAG: s_and_b32 [[UNPACK_0:s[0-9]+]], [[DATA]], 0xffff ; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_0:[0-9]+]], [[UNPACK_0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_1:[0-9]+]], [[UNPACK_1]] - - - ; UNPACKED: image_store v{{\[}}[[V_UNPACK_0]]:[[V_UNPACK_1]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 ; PACKED: image_store v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 @@ -78,19 +75,17 @@ ret void } -; GCN-LABEL: {{^}}image_store_v4f16 -; UNPACKED: s_load_dword s -; UNPACKED: s_load_dword s -; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; GCN-LABEL: {{^}}image_store_v4f16: +; UNPACKED: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; UNPACKED-DAG: s_lshr_b32 s{{[0-9]+}}, s[[LO]], 16 +; UNPACKED-DAG: s_lshr_b32 s{{[0-9]+}}, s[[HI]], 16 ; UNPACKED: s_and_b32 ; UNPACKED: s_and_b32 ; UNPACKED: image_store v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 -; PACKED: s_load_dword [[DATA0:s[0-9]+]] -; PACKED: s_load_dword [[DATA1:s[0-9]+]] -; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]] -; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]] +; PACKED: s_load_dwordx2 s{{\[}}[[DATA0:[0-9]+]]:[[DATA1:[0-9]+]]{{\]}} +; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[DATA0]] +; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[DATA1]] ; PACKED: image_store v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 define amdgpu_kernel void @image_store_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { main_body: @@ -98,19 +93,17 @@ ret void } -; GCN-LABEL: {{^}}image_store_mip_v4f16 -; UNPACKD: s_load_dword s -; UNPACKD: s_load_dword s -; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 -; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; GCN-LABEL: {{^}}image_store_mip_v4f16: +; UNPACKED: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; UNPACKED-DAG: s_lshr_b32 s{{[0-9]+}}, s[[LO]], 16 +; UNPACKED-DAG: s_lshr_b32 s{{[0-9]+}}, s[[HI]], 16 ; UNPACKED: s_and_b32 ; UNPACKED: s_and_b32 ; UNPACKED: image_store_mip v{{\[[0-9]+:[0-9]+\]}}, 
v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 -; PACKED: s_load_dword [[DATA0:s[0-9]+]] -; PACKED: s_load_dword [[DATA1:s[0-9]+]] -; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]] -; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]] +; PACKED: s_load_dwordx2 s{{\[}}[[DATA0:[0-9]+]]:[[DATA1:[0-9]+]]{{\]}} +; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[DATA0]] +; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[DATA1]] ; PACKED: image_store_mip v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 define amdgpu_kernel void @image_store_mip_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { main_body: Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -29,22 +29,21 @@ } ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: -; GCN-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; GCN-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x14 +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} -; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] ; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen -; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]] -; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]] +; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] ; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { main_body: Index: test/CodeGen/AMDGPU/mad-mix-lo.ll =================================================================== --- test/CodeGen/AMDGPU/mad-mix-lo.ll +++ test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -94,9 +94,12 @@ ; GCN-LABEL: {{^}}v_mad_mix_v3f32: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v3, v6 op_sel_hi:[1,1,1] -; GFX9-NEXT: v_mad_mixlo_f16 v1, v1, v4, v7 op_sel_hi:[1,1,1] -; GFX9-NEXT: v_mad_mixlo_f16 v2, v2, v5, v8 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-NEXT: s_setpc_b64 define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { %src0.ext = fpext <3 x half> %src0 to <3 x float> @@ -110,11 +113,11 @@ ; GCN-LABEL: {{^}}v_mad_mix_v4f32: 
; GCN: s_waitcnt ; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] ; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; GFX9-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-NEXT: s_setpc_b64 define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { %src0.ext = fpext <4 x half> %src0 to <4 x float> @@ -145,14 +148,12 @@ ; FIXME: Should be packed into 2 registers per argument? ; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mixlo_f16 v2, v2, v5, v8 op_sel_hi:[1,1,1] -; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v3, v6 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: s_movk_i32 s6, 0x7e00 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, s6, 16, v2 -; GFX9-NEXT: v_mad_mixhi_f16 v0, v1, v4, v7 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 clamp -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-NEXT: s_setpc_b64 define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { %src0.ext = fpext <3 x half> %src0 to <3 x float> @@ -168,12 +169,12 @@ ; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt: ; GCN: s_waitcnt ; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp ; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9-DAG: v_mov_b32_e32 v0, v6 -; GFX9-DAG: v_mov_b32_e32 v1, v2 -; GFX9: s_setpc_b64 +; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-NEXT: s_setpc_b64 define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> @@ -243,10 +244,14 @@ ret <2 x half> %cvt.result } +; FIXME: Handling undef 4th component ; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt: -; GFX9: v_mad_mix_f32 v0, v0, v3, v6 op_sel_hi:[1,1,1] clamp -; GFX9: v_mad_mix_f32 v1, v1, v4, v7 op_sel_hi:[1,1,1] clamp -; GFX9: v_mad_mix_f32 v2, v2, v5, v8 op_sel_hi:[1,1,1] clamp +; GFX9: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX9: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX9: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX9: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] + +; GFX9: v_cvt_f16_f32 ; GFX9: v_cvt_f16_f32 ; GFX9: v_cvt_f16_f32 ; GFX9: v_cvt_f16_f32 Index: test/CodeGen/AMDGPU/mul.i16.ll 
=================================================================== --- test/CodeGen/AMDGPU/mul.i16.ll +++ test/CodeGen/AMDGPU/mul.i16.ll @@ -66,15 +66,10 @@ ; VI: v_mul_lo_u16 ; VI: v_mul_lo_u16 -; GFX9: v_and_b32 -; GFX9: v_and_b32 -; GFX9: v_lshl_or_b32 -; GFX9: v_lshl_or_b32 -; GFX9: v_lshl_or_b32 - -; GFX9: v_pk_mul_lo_u16 -; GFX9: v_pk_mul_lo_u16 -; GFX9: s_setpc_b64 +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_mul_lo_u16 +; GFX9-NEXT: v_pk_mul_lo_u16 +; GFX9-NEXT: s_setpc_b64 define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) { %r.val = mul <3 x i16> %a, %b ret <3 x i16> %r.val @@ -94,8 +89,8 @@ ; VI: v_or_b32_e32 ; GFX9: s_waitcnt -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) { %r.val = mul <4 x i16> %a, %b Index: test/CodeGen/AMDGPU/reduce-build-vec-ext-to-ext-build-vec.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/reduce-build-vec-ext-to-ext-build-vec.ll @@ -0,0 +1,20 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s + +; Make sure reduceBuildVecExtToExtBuildVec combine doesn't regress + +; code with legal v4i16. The v4i16 build_vector it produces will be +; custom lowered into an i32 based build_vector, producing a mess that +; nothing manages to put back together. + +; GCN-LABEL: {{^}}v2i16_to_i64: +; GFX9: s_waitcnt +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_setpc_b64 +define i64 @v2i16_to_i64(<2 x i16> %x, <2 x i16> %y) { + %x.add = add <2 x i16> %x, %y + %zext = zext <2 x i16> %x.add to <2 x i32> + %arst = bitcast <2 x i32> %zext to i64 + ret i64 %arst +} Index: test/CodeGen/AMDGPU/select-vectors.ll =================================================================== --- test/CodeGen/AMDGPU/select-vectors.ll +++ test/CodeGen/AMDGPU/select-vectors.ll @@ -110,13 +110,9 @@ ; SI: cndmask ; SI-NOT: cndmask -; GFX9: v_cndmask_b32_e32 -; GFX9: cndmask -; GFX9-NOT: cndmask - -; VI: v_cndmask_b32 -; VI: v_cndmask_b32 -; VI: v_cndmask_b32 +; GFX89: v_cndmask_b32_e32 +; GFX89: cndmask +; GFX89-NOT: cndmask define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr Index: test/CodeGen/AMDGPU/sminmax.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -112,15 +112,15 @@ } ; GCN-LABEL: {{^}}s_abs_v4i16: -; GFX9: s_load_dword [[VAL0:s[0-9]+]] -; GFX9: s_load_dword [[VAL1:s[0-9]+]] -; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, [[VAL0]] -; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], [[VAL0]], [[SUB0]] -; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 +; GFX9: s_load_dwordx2 s{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}, s[0:1], 0x2c +; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[VAL0]] +; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[VAL1]] -; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, [[VAL1]] -; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], [[VAL1]], [[SUB1]] -; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 +; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[VAL0]], [[SUB0]] +; GFX9-DAG: 
v_pk_max_i16 [[MAX1:v[0-9]+]], s[[VAL1]], [[SUB1]]
+
+; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0]
+; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0]
 define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
   %z0 = insertelement <4 x i16> undef, i16 0, i16 0
   %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
@@ -197,8 +197,8 @@
 ; GCN-LABEL: {{^}}s_min_max_v4i16:
 ; GFX9: v_pk_max_i16
-; GFX9: v_pk_max_i16
 ; GFX9: v_pk_min_i16
+; GFX9: v_pk_max_i16
 ; GFX9: v_pk_min_i16
 define amdgpu_kernel void @s_min_max_v4i16(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 {
   %cond0 = icmp sgt <4 x i16> %val0, %val1
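With the v4i16/v4f16 arithmetic ops marked Custom above, splitBinaryVectorOp splits each operation into its two packed <2 x 16-bit> halves instead of letting LegalizeDAG fully scalarize it. A minimal IR sketch of the intended behavior, analogous to the v_mul_v4i16 check in mul.i16.ll; the function name and the expectation of two packed instructions (e.g. two v_pk_add_f16 on gfx9) are illustrative assumptions, not checks added by this patch:

define <4 x half> @v_fadd_v4f16(<4 x half> %a, <4 x half> %b) {
  ; Expected to be selected as two packed half adds, one per <2 x half>
  ; half of the vector, rather than four scalar f16 operations.
  %add = fadd <4 x half> %a, %b
  ret <4 x half> %add
}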