diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -360,6 +360,8 @@
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
@@ -1408,6 +1410,11 @@
       Start != 1)
     return Op;

+  if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
+       (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
+      (Start == 0 || Start == 4))
+    return Op;
+
   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                             VT.getVectorNumElements());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -134,6 +134,8 @@
     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
     addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
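+    // An 8 x 16-bit value occupies a full 128-bit register, so v8i16/v8f16
+    // use the same 128-bit register classes as the other 128-bit types.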
+    addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
+    addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
   }

   addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -269,7 +271,8 @@
                   MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
                   MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
                   MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
-                  MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
+                  MVT::v8i16, MVT::v8f16, MVT::v16i64, MVT::v16f64,
+                  MVT::v32i32, MVT::v32f32 }) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -611,7 +614,8 @@
   if (STI.hasMadF16())
     setOperationAction(ISD::FMAD, MVT::f16, Legal);

-  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
+  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
+                 MVT::v8f16}) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -673,6 +677,16 @@
   setOperationAction(ISD::STORE, MVT::v4f16, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

+  setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
+  setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
+
+  setOperationAction(ISD::STORE, MVT::v8i16, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
+  setOperationAction(ISD::STORE, MVT::v8f16, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
+
   setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
   setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
   setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
@@ -682,6 +701,10 @@
   setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
   setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);

+  setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Expand);
+  setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Expand);
+  setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Expand);
+
   if (!Subtarget->hasVOP3PInsts()) {
     setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
@@ -699,9 +722,20 @@
     setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
     setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::v8f16, Custom);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v8f16, Custom);

     setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
     setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
+    setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand);
+    setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand);
+
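+    // Without packed-math instructions the v8 types still need custom
+    // BUILD_VECTOR and EXTRACT_VECTOR_ELT lowering; INSERT_VECTOR_ELT and
+    // SCALAR_TO_VECTOR can simply be expanded.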
+    for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) {
+      setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand);
+    }
   }

   if (Subtarget->hasVOP3PInsts()) {
@@ -735,34 +769,42 @@
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);

-    setOperationAction(ISD::SHL, MVT::v4i16, Custom);
-    setOperationAction(ISD::SRA, MVT::v4i16, Custom);
-    setOperationAction(ISD::SRL, MVT::v4i16, Custom);
-    setOperationAction(ISD::ADD, MVT::v4i16, Custom);
-    setOperationAction(ISD::SUB, MVT::v4i16, Custom);
-    setOperationAction(ISD::MUL, MVT::v4i16, Custom);
+    for (MVT VT : { MVT::v4i16, MVT::v8i16 }) {
+      // Split vector operations.
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::ADD, VT, Custom);
+      setOperationAction(ISD::SUB, VT, Custom);
+      setOperationAction(ISD::MUL, VT, Custom);

-    setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
-    setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
-    setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
-    setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
+      setOperationAction(ISD::SMIN, VT, Custom);
+      setOperationAction(ISD::SMAX, VT, Custom);
+      setOperationAction(ISD::UMIN, VT, Custom);
+      setOperationAction(ISD::UMAX, VT, Custom);

-    setOperationAction(ISD::UADDSAT, MVT::v4i16, Custom);
-    setOperationAction(ISD::SADDSAT, MVT::v4i16, Custom);
-    setOperationAction(ISD::USUBSAT, MVT::v4i16, Custom);
-    setOperationAction(ISD::SSUBSAT, MVT::v4i16, Custom);
+      setOperationAction(ISD::UADDSAT, VT, Custom);
+      setOperationAction(ISD::SADDSAT, VT, Custom);
+      setOperationAction(ISD::USUBSAT, VT, Custom);
+      setOperationAction(ISD::SSUBSAT, VT, Custom);
+    }

-    setOperationAction(ISD::FADD, MVT::v4f16, Custom);
-    setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
-    setOperationAction(ISD::FMA, MVT::v4f16, Custom);
+    for (MVT VT : { MVT::v4f16, MVT::v8f16 }) {
+      // Split vector operations.
+      setOperationAction(ISD::FADD, VT, Custom);
+      setOperationAction(ISD::FMUL, VT, Custom);
+      setOperationAction(ISD::FMA, VT, Custom);
+      setOperationAction(ISD::FCANONICALIZE, VT, Custom);
+    }

     setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
     setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);

     setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
     setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
-    setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);

     setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
     setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
@@ -799,7 +841,8 @@
     setOperationAction(ISD::FABS, MVT::v2f16, Custom);
   }

-  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
+  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
+                  MVT::v8i16, MVT::v8f16 }) {
     setOperationAction(ISD::SELECT, VT, Custom);
   }

@@ -4610,7 +4653,8 @@
   unsigned Opc = Op.getOpcode();
   EVT VT = Op.getValueType();
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
-         VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
+         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8f32 ||
+         VT == MVT::v16f32 || VT == MVT::v32f32);

   SDValue Lo0, Hi0;
   std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4631,21 +4675,26 @@
   unsigned Opc = Op.getOpcode();
   EVT VT = Op.getValueType();
-  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
-         VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
+  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
+         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v8f32 ||
+         VT == MVT::v16f32 || VT == MVT::v32f32);

   SDValue Lo0, Hi0;
-  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+  SDValue Op0 = Op.getOperand(0);
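+  // For a ternary op such as SELECT, operand 0 (the condition) may be a
+  // scalar rather than a vector; in that case both halves reuse it unsplit.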
+  std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
+                           ? DAG.SplitVectorOperand(Op.getNode(), 0)
+                           : std::make_pair(Op0, Op0);
   SDValue Lo1, Hi1;
   std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
   SDValue Lo2, Hi2;
   std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);

   SDLoc SL(Op);
+  auto ResVT = DAG.GetSplitDestVTs(VT);

-  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
+  SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
                              Op->getFlags());
-  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
+  SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
                              Op->getFlags());

   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
@@ -5307,7 +5356,7 @@
   if (IsIEEEMode)
     return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);

-  if (VT == MVT::v4f16)
+  if (VT == MVT::v4f16 || VT == MVT::v8f16)
     return splitBinaryVectorOp(Op, DAG);
   return Op;
 }
@@ -5709,7 +5758,6 @@
   EVT VecVT = Vec.getValueType();
   unsigned VecSize = VecVT.getSizeInBits();
   EVT EltVT = VecVT.getVectorElementType();
-  assert(VecSize <= 64);

   DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

@@ -5720,6 +5768,28 @@
   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
     return Combined;

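+  // Split a 128-bit vector through v2i64: extract each 64-bit half, pick the
+  // half containing the element (Idx > NElem/2-1), and re-index into it with
+  // Idx & (NElem/2-1).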
+  if (VecSize == 128) {
+    SDValue Lo, Hi;
+    EVT LoVT, HiVT;
+    SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
+    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
+    Lo =
+        DAG.getBitcast(LoVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
+                                         V2, DAG.getConstant(0, SL, MVT::i32)));
+    Hi =
+        DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
+                                         V2, DAG.getConstant(1, SL, MVT::i32)));
+    EVT IdxVT = Idx.getValueType();
+    unsigned NElem = VecVT.getVectorNumElements();
+    assert(isPowerOf2_32(NElem));
+    SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
+    SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
+    SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
+  }
+
+  assert(VecSize <= 64);
+
   unsigned EltSize = EltVT.getSizeInBits();
   assert(isPowerOf2_32(EltSize));

@@ -5802,20 +5872,27 @@
   SDLoc SL(Op);
   EVT VT = Op.getValueType();

-  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
-    EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
+  if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+      VT == MVT::v8i16 || VT == MVT::v8f16) {
+    EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
+                                  VT.getVectorNumElements() / 2);
+    MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());

     // Turn into pair of packed build_vectors.
     // TODO: Special case for constants that can be materialized with s_mov_b64.
-    SDValue Lo = DAG.getBuildVector(HalfVT, SL,
-                                    { Op.getOperand(0), Op.getOperand(1) });
-    SDValue Hi = DAG.getBuildVector(HalfVT, SL,
-                                    { Op.getOperand(2), Op.getOperand(3) });
+    SmallVector<SDValue, 4> LoOps, HiOps;
+    for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
+      LoOps.push_back(Op.getOperand(I));
+      HiOps.push_back(Op.getOperand(I + E));
+    }
+    SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
+    SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);

-    SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
-    SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
+    SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
+    SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);

-    SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
+    SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
+                                       { CastLo, CastHi });
     return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
   }

@@ -8427,6 +8504,9 @@
 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
+  if (VT.getSizeInBits() == 128)
+    return splitTernaryVectorOp(Op, DAG);
+
   assert(VT.getSizeInBits() == 64);

   SDLoc DL(Op);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1192,6 +1192,26 @@
   (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
 >;

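+// The EXTRACT_SUBVECTOR lowering in AMDGPUISelLowering.cpp only keeps
+// v8i16/v8f16 extract_subvector nodes whose offset is 0 or 4, so both cases
+// map directly onto a 64-bit subregister copy.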
+def : Pat <
+  (extract_subvector v8i16:$vec, (i32 0)),
+  (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1))
+>;
+
+def : Pat <
+  (extract_subvector v8i16:$vec, (i32 4)),
+  (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3))
+>;
+
+def : Pat <
+  (extract_subvector v8f16:$vec, (i32 0)),
+  (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1))
+>;
+
+def : Pat <
+  (extract_subvector v8f16:$vec, (i32 4)),
+  (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3))
+>;
+
 foreach Index = 0-31 in {
   def Extract_Element_v32i32_#Index : Extract_Element <
     i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1287,6 +1307,26 @@
 def : BitConvert <v2f64, v2i64, VReg_128>;
 def : BitConvert <v4f32, v4i32, VReg_128>;
 def : BitConvert <v4i32, v4f32, VReg_128>;
+def : BitConvert <v2i64, v4f32, VReg_128>;
+def : BitConvert <v4f32, v2i64, VReg_128>;
+def : BitConvert <v8i16, v8f16, SReg_128>;
+def : BitConvert <v8f16, v8i16, SReg_128>;
+def : BitConvert <v8i16, v4i32, SReg_128>;
+def : BitConvert <v4i32, v8i16, SReg_128>;
+def : BitConvert <v8f16, v4i32, SReg_128>;
+def : BitConvert <v4i32, v8f16, SReg_128>;
+def : BitConvert <v8i16, v4f32, VReg_128>;
+def : BitConvert <v4f32, v8i16, VReg_128>;
+def : BitConvert <v8f16, v4f32, VReg_128>;
+def : BitConvert <v4f32, v8f16, VReg_128>;
+def : BitConvert <v8i16, v2i64, SReg_128>;
+def : BitConvert <v2i64, v8i16, SReg_128>;
+def : BitConvert <v8f16, v2i64, SReg_128>;
+def : BitConvert <v2i64, v8f16, SReg_128>;
+def : BitConvert <v8i16, v2f64, VReg_128>;
+def : BitConvert <v2f64, v8i16, VReg_128>;
+def : BitConvert <v8f16, v2f64, VReg_128>;
+def : BitConvert <v2f64, v8f16, VReg_128>;

 // 160-bit bitcast
 def : BitConvert <v5i32, v5f32, SReg_160>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -617,7 +617,7 @@
   let HasSGPR = 1;
 }

-def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
+def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32,
   (add PRIVATE_RSRC_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
@@ -784,7 +784,7 @@
 }

 defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
 defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
 defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
 defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
@@ -824,7 +824,7 @@
 defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
                          (add VGPR_64)>;
 defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
-defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>;
+defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>;
 defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;

 defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
@@ -846,7 +846,7 @@
 defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
                          (add AGPR_64)>;
 defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
-defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
+defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>;
 defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
 defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
 defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -113,10 +113,10 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -235,10 
+235,10 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -293,10 +293,10 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-SIZE-NEXT: Cost 
Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll @@ -55,10 +55,10 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -113,10 +113,10 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-SIZE-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -235,10 +235,10 @@ ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) ; FAST-NEXT: Cost Model: Found an estimated 
cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) @@ -293,10 +293,10 @@ ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/cast.ll b/llvm/test/Analysis/CostModel/AMDGPU/cast.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/cast.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/cast.ll @@ -299,33 +299,19 @@ } define void @sitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { -; FAST-LABEL: 'sitofp8' -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float> -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float> -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float> -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float> -; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void -; -; SLOW-LABEL: 'sitofp8' -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float> -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float> -; SLOW-NEXT: 
Cost Model: Found an estimated cost of 8 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float> -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float> -; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void -; -; FAST-SIZE-LABEL: 'sitofp8' -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; ALL-LABEL: 'sitofp8' +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float> +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; -; SLOW-SIZE-LABEL: 'sitofp8' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float> -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float> -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float> -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float> -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; ALL-SIZE-LABEL: 'sitofp8' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = sitofp <8 x i1> %a to <8 x float> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = sitofp <8 x i8> %b to <8 x float> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = sitofp <8 x i16> %c to <8 x float> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = sitofp <8 x i32> %d to <8 x float> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %A1 = sitofp <8 x i1> %a to <8 x float> %B1 = sitofp <8 x i8> %b to <8 x float> @@ -391,33 +377,19 @@ } define void @uitofp8(<8 x i1> %a, <8 x i8> %b, <8 x i16> %c, <8 x i32> %d) { -; FAST-LABEL: 'uitofp8' -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float> -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float> -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float> -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float> -; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void -; -; SLOW-LABEL: 'uitofp8' -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float> -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%B1 = uitofp <8 x i8> %b to <8 x float> -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float> -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float> -; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void -; -; FAST-SIZE-LABEL: 'uitofp8' -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float> -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; ALL-LABEL: 'uitofp8' +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float> +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; -; SLOW-SIZE-LABEL: 'uitofp8' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float> -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float> -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float> -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float> -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; ALL-SIZE-LABEL: 'uitofp8' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %A1 = uitofp <8 x i1> %a to <8 x float> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %B1 = uitofp <8 x i8> %b to <8 x float> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %C1 = uitofp <8 x i16> %c to <8 x float> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %D1 = uitofp <8 x i32> %d to <8 x float> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %A1 = uitofp <8 x i1> %a to <8 x float> %B1 = uitofp <8 x i8> %b to <8 x float> diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -115,7 +115,7 @@ ; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef ; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fadd <17 x half> undef, undef ; FASTF16-NEXT: Cost 
Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fadd_f16' @@ -135,7 +135,7 @@ ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fadd <17 x half> undef, undef ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fadd_f16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll @@ -165,7 +165,7 @@ ; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %v17f16 = fdiv <17 x half> undef, undef +; FP16-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOFP16-SIZE-LABEL: 'fdiv_f16_f32ieee' @@ -185,7 +185,7 @@ ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v17f16 = fdiv <17 x half> undef, undef +; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f16 = fdiv half undef, undef @@ -216,7 +216,7 @@ ; FP16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %v17f16 = fdiv <17 x half> undef, undef +; FP16-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOFP16-SIZE-LABEL: 'fdiv_f16_f32ftzdaz' @@ -236,7 +236,7 @@ ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16 = fdiv <4 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v5f16 = fdiv <5 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v16f16 = fdiv <16 x half> undef, undef -; FP16-SIZE-NEXT: 
Cost Model: Found an estimated cost of 320 for instruction: %v17f16 = fdiv <17 x half> undef, undef +; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %v17f16 = fdiv <17 x half> undef, undef ; FP16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f16 = fdiv half undef, undef diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll --- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -1,34 +1,34 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,SLOWF64,NOPACKEDF32 %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,FASTF64,PACKEDF32 %s -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,SLOWF64,NOPACKEDF32 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,SLOWF64 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,FASTF64 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST,SLOWF64 %s ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE,NOPACKEDF32-SIZE %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,FASTF64-SIZE,PACKEDF32-SIZE %s -; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE,NOPACKEDF32-SIZE %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,FASTF64-SIZE %s +; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST-SIZE,SLOWF64-SIZE %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW-SIZE %s ; END. 
define amdgpu_kernel void @fma_f32() #0 { -; NOPACKEDF32-LABEL: 'fma_f32' -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2 -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2 -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2 -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2 -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2 -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2 -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void -; -; PACKEDF32-LABEL: 'fma_f32' -; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2 -; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2 -; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2 -; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2 -; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2 -; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2 -; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2 -; PACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; SLOWF64-LABEL: 'fma_f32' +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2 +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2 +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2 +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2 +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x 
float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; FASTF64-LABEL: 'fma_f32'
+; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
+; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
+; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
+; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
+; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
+; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
+; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-LABEL: 'fma_f32'
 ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
@@ -40,25 +40,25 @@
 ; SLOW-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
 ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; NOPACKEDF32-SIZE-LABEL: 'fma_f32'
-; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; PACKEDF32-SIZE-LABEL: 'fma_f32'
-; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
-; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
-; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
-; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
-; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
-; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
-; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
-; PACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SLOWF64-SIZE-LABEL: 'fma_f32'
+; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
+; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
+; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
+; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
+; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
+; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
+; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; FASTF64-SIZE-LABEL: 'fma_f32'
+; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
+; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #2
+; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #2
+; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) #2
+; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #2
+; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) #2
+; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) #2
+; FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOW-SIZE-LABEL: 'fma_f32'
 ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #2
@@ -145,7 +145,7 @@
 ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
 ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
 ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
+; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-LABEL: 'fma_f16'
@@ -165,7 +165,7 @@
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) #2
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) #2
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) #2
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) #2
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOW-SIZE-LABEL: 'fma_f16'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -115,7 +115,7 @@
 ; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef
 ; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef
 ; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef
-; GFX9-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fmul <17 x half> undef, undef
+; GFX9-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fmul <17 x half> undef, undef
 ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-LABEL: 'fmul_f16'
@@ -135,7 +135,7 @@
 ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef
 ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef
 ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef
-; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fmul <17 x half> undef, undef
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fmul <17 x half> undef, undef
 ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOW-SIZE-LABEL: 'fmul_f16'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fptosi.ll b/llvm/test/Analysis/CostModel/AMDGPU/fptosi.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/fptosi.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fptosi.ll
@@ -59,7 +59,7 @@
 ; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
 ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
-; FAST-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
+; FAST-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW-LABEL: 'fptosi_double_i16'
@@ -73,7 +73,7 @@
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16>
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'fptosi_double_i16'
@@ -181,8 +181,8 @@
 ; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
 ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
-; FAST-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
-; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
+; FAST-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
+; FAST-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW-LABEL: 'fptosi_float_i16'
@@ -197,8 +197,8 @@
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptosi <2 x float> undef to <2 x i16>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16>
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16>
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'fptosi_float_i16'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fptoui.ll b/llvm/test/Analysis/CostModel/AMDGPU/fptoui.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/fptoui.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fptoui.ll
@@ -59,7 +59,7 @@
 ; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
 ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
-; FAST-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
+; FAST-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW-LABEL: 'fptoui_double_i16'
@@ -73,7 +73,7 @@
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16>
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'fptoui_double_i16'
@@ -181,8 +181,8 @@
 ; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
 ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
-; FAST-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
-; FAST-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
+; FAST-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
+; FAST-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
 ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW-LABEL: 'fptoui_float_i16'
@@ -197,8 +197,8 @@
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = fptoui <2 x float> undef to <2 x i16>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16>
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
-; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16>
+; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16>
 ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'fptoui_float_i16'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -115,7 +115,7 @@
 ; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef
 ; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef
 ; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef
-; FASTF16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fsub <17 x half> undef, undef
+; FASTF16-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fsub <17 x half> undef, undef
 ; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOWF64-LABEL: 'fsub_f16'
@@ -135,7 +135,7 @@
 ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef
 ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef
 ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef
-; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v17f16 = fsub <17 x half> undef, undef
+; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v17f16 = fsub <17 x half> undef, undef
 ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOWF64-SIZE-LABEL: 'fsub_f16'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
--- a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -79,7 +79,7 @@
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i16 = mul <4 x i16> undef, undef
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5i16 = mul <5 x i16> undef, undef
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i16 = mul <16 x i16> undef, undef
-; FAST16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v17i16 = mul <17 x i16> undef, undef
+; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17i16 = mul <17 x i16> undef, undef
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW16-SIZE-LABEL: 'mul_i16'
@@ -99,7 +99,7 @@
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = mul <4 x i16> undef, undef
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5i16 = mul <5 x i16> undef, undef
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16 = mul <16 x i16> undef, undef
-; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v17i16 = mul <17 x i16> undef, undef
+; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17i16 = mul <17 x i16> undef, undef
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %i16 = mul i16 undef, undef
@@ -144,7 +144,7 @@
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, 16
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef,
-; FAST16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32i16 = mul <32 x i16> undef,
+; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef,
@@ -182,7 +182,7 @@
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, 16
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef,
-; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32i16 = mul <32 x i16> undef,
+; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef,
@@ -244,7 +244,7 @@
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, 16
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef,
-; FAST16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32i16 = mul <32 x i16> undef,
+; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, 16
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef,
@@ -282,7 +282,7 @@
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, 16
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef,
-; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32i16 = mul <32 x i16> undef,
+; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, 16
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef,
@@ -344,7 +344,7 @@
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, -16
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef,
-; FAST16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32i16 = mul <32 x i16> undef,
+; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef,
@@ -382,7 +382,7 @@
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, -16
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef,
-; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32i16 = mul <32 x i16> undef,
+; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef,
@@ -444,7 +444,7 @@
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = mul i16 undef, -16
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i16 = mul <8 x i16> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i16 = mul <16 x i16> undef,
-; FAST16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32i16 = mul <32 x i16> undef,
+; FAST16-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = mul <32 x i16> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = mul i8 undef, -16
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i8 = mul <16 x i8> undef,
 ; FAST16-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = mul <32 x i8> undef,
@@ -482,7 +482,7 @@
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = mul i16 undef, -16
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = mul <8 x i16> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = mul <16 x i16> undef,
-; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32i16 = mul <32 x i16> undef,
+; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32i16 = mul <32 x i16> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = mul i8 undef, -16
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i8 = mul <16 x i8> undef,
 ; FAST16-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32i8 = mul <32 x i8> undef,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
@@ -715,27 +715,27 @@
     ; GFX6-LABEL: name: load_constant_v8s16
    ; GFX6: liveins: $sgpr0_sgpr1
    ; GFX6-NEXT: {{ $}}
-    ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>), align 4, addrspace 4)
-    ; GFX6-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX6-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX6-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+    ; GFX6-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
    ; GFX7-LABEL: name: load_constant_v8s16
    ; GFX7: liveins: $sgpr0_sgpr1
    ; GFX7-NEXT: {{ $}}
-    ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
-    ; GFX7-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>), align 4, addrspace 4)
-    ; GFX7-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX7-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+    ; GFX7-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
    ; GFX8-LABEL: name: load_constant_v8s16
    ; GFX8: liveins: $sgpr0_sgpr1
    ; GFX8-NEXT: {{ $}}
-    ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
-    ; GFX8-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>), align 4, addrspace 4)
-    ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX8-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+    ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
    ; GFX10-LABEL: name: load_constant_v8s16
    ; GFX10: liveins: $sgpr0_sgpr1
    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:sgpr_128(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load (<8 x s16>), align 4, addrspace 4)
-    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+    ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+    ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
    %0:sgpr(p4) = COPY $sgpr0_sgpr1
    %1:sgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 4)
    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -741,27 +741,27 @@
    ; GFX7-LABEL: name: load_flat_v8s16
    ; GFX7: liveins: $vgpr0_vgpr1
    ; GFX7-NEXT: {{ $}}
-    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4)
-    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
+    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
    ; GFX8-LABEL: name: load_flat_v8s16
    ; GFX8: liveins: $vgpr0_vgpr1
    ; GFX8-NEXT: {{ $}}
-    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4)
-    ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
+    ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
    ; GFX9-LABEL: name: load_flat_v8s16
    ; GFX9: liveins: $vgpr0_vgpr1
    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
+    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
    ; GFX10-LABEL: name: load_flat_v8s16
    ; GFX10: liveins: $vgpr0_vgpr1
    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4)
-    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4)
+    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
    %0:vgpr(p1) = COPY $vgpr0_vgpr1
    %1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 0)
    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
@@ -873,33 +873,33 @@
    ; GFX7-LABEL: name: load_global_v8s16
    ; GFX7: liveins: $vgpr0_vgpr1
    ; GFX7-NEXT: {{ $}}
-    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)
-    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
+    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
    ; GFX7-FLAT-LABEL: name: load_global_v8s16
    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
    ; GFX7-FLAT-NEXT: {{ $}}
-    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)
-    ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
+    ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
    ; GFX8-LABEL: name: load_global_v8s16
    ; GFX8: liveins: $vgpr0_vgpr1
    ; GFX8-NEXT: {{ $}}
-    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)
-    ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1)
+    ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
    ; GFX9-LABEL: name: load_global_v8s16
    ; GFX9: liveins: $vgpr0_vgpr1
    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
+    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
    ; GFX10-LABEL: name: load_global_v8s16
    ; GFX10: liveins: $vgpr0_vgpr1
    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1)
-    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1)
+    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]]
    %0:vgpr(p1) = COPY $vgpr0_vgpr1
    %1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 1)
    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
@@ -1,7 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s
 # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
 ---
@@ -27,6 +27,12 @@
    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), addrspace 3)
    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
+    ; GFX10-LABEL: name: load_local_v4s32_align16
+    ; GFX10: liveins: $vgpr0
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), addrspace 3)
+    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 16, addrspace 3)
    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -57,6 +63,12 @@
    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
+    ; GFX10-LABEL: name: load_local_v4s32_align_8
+    ; GFX10: liveins: $vgpr0
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
+    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 8, addrspace 3)
    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -87,6 +99,12 @@
    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 50, 51, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
+    ; GFX10-LABEL: name: load_local_v4s32_align_8_offset_160
+    ; GFX10: liveins: $vgpr0
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 50, 51, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
+    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 400
    %2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -123,6 +141,14 @@
    ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
    ; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
+    ; GFX10-LABEL: name: load_local_v4s32_align_8_offset_320
+    ; GFX10: liveins: $vgpr0
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4000, implicit $exec
+    ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX10-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load (<4 x s32>), align 8, addrspace 3)
+    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(s32) = G_CONSTANT i32 4000
    %2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -155,6 +181,12 @@
    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<2 x s64>), align 8, addrspace 3)
    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
+    ; GFX10-LABEL: name: load_local_v2s64
+    ; GFX10: liveins: $vgpr0
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<2 x s64>), align 8, addrspace 3)
+    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 8, addrspace 3)
    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -185,6 +217,12 @@
    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load (<2 x p1>), align 8, addrspace 3)
    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ; GFX10-LABEL: name: load_local_v2p1
+    ; GFX10: liveins: $vgpr0
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load (<2 x p1>), align 8, addrspace 3)
+    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 8, addrspace 3)
    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -215,6 +253,12 @@
    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p3) :: (load (s128), align 8, addrspace 3)
    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ; GFX10-LABEL: name: load_local_s128
+    ; GFX10: liveins: $vgpr0
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p3) :: (load (s128), align 8, addrspace 3)
+    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(s128) = G_LOAD %0 :: (load (s128), align 8, addrspace 3)
    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -235,16 +279,22 @@
    ; GFX7-LABEL: name: load_local_v8s16
    ; GFX7: liveins: $vgpr0
    ; GFX7-NEXT: {{ $}}
-    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX7-NEXT: $m0 = S_MOV_B32 -1
-    ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
-    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX7-NEXT: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (<8 x s16>), align 8, addrspace 3)
+    ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]
    ; GFX9-LABEL: name: load_local_v8s16
    ; GFX9: liveins: $vgpr0
    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p3) :: (load (<8 x s16>), align 8, addrspace 3)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<8 x s16>), align 8, addrspace 3)
+    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
+    ; GFX10-LABEL: name: load_local_v8s16
+    ; GFX10: liveins: $vgpr0
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10-NEXT: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<8 x s16>), align 8, addrspace 3)
+    ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
    %0:vgpr(p3) = COPY $vgpr0
    %1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 8, addrspace 3)
    $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -503,27 +503,27 @@
    ; GFX7-LABEL: name: store_flat_v8s16
    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
    ; GFX7-NEXT: {{ $}}
-    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>))
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
    ; GFX8-LABEL: name: store_flat_v8s16
    ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
    ; GFX8-NEXT: {{ $}}
-    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX8-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>))
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
    ; GFX9-LABEL: name: store_flat_v8s16
    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX9-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>))
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX9-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
    ; GFX10-LABEL: name: store_flat_v8s16
    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX10-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>))
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX10-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>))
    %0:vgpr(p1) = COPY $vgpr0_vgpr1
    %1:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
    G_STORE %1, %0 :: (store (<8 x s16>), align 16, addrspace 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
@@ -549,33 +549,33 @@
    ; GFX7-LABEL: name: store_global_v8s16
    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
    ; GFX7-NEXT: {{ $}}
-    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)
+    ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
    ; GFX7-FLAT-LABEL: name: store_global_v8s16
    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
    ; GFX7-FLAT-NEXT: {{ $}}
-    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7-FLAT-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)
+    ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
    ; GFX8-LABEL: name: store_global_v8s16
    ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
    ; GFX8-NEXT: {{ $}}
-    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX8-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)
+    ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1)
    ; GFX9-LABEL: name: store_global_v8s16
    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
    ; GFX9-NEXT: {{ $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX9-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX9-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
    ; GFX10-LABEL: name: store_global_v8s16
    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
    ; GFX10-NEXT: {{ $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX10-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1)
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX10-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1)
    %0:vgpr(p1) = COPY $vgpr0_vgpr1
    %1:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
    G_STORE %1, %0 :: (store (<8 x s16>), align 16, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -200,12 +200,13 @@
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i64:
+; GFX9PLUS: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
 ; GFX9PLUS: global_load_dword [[A:v[0-9]+]]
 ; GFX9PLUS: global_load_dword [[B:v[0-9]+]]
 ; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
-; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
-; GFX9PLUS-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]]
+; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9PLUS: buffer_store_dwordx4
 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
--- a/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll
@@ -23,7 +23,6 @@
 ; GCN-LABEL: {{^}}test_vector_creation:
 ; GCN: global_load_dwordx2 v[{{[0-9]*[02468]}}:{{[0-9]+}}],
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[HI:[13579]]], v{{[0-9]+}}
 ; GCN: global_store_dwordx4 v[{{[0-9]*[02468]:[0-9]*[13579]}}], v[{{[0-9]*[02468]:[0-9]*[13579]}}]
 define amdgpu_kernel void @test_vector_creation() {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -0,0 +1,437 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+
+define <4 x i16> @extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
+; SI-LABEL: extract_4xi16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_cbranch_scc0 .LBB0_2
+; SI-NEXT: ; %bb.1: ; %F
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s6
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_or_b32_e32 v2, v6, v2
+; SI-NEXT: v_or_b32_e32 v3, v4, v3
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; SI-NEXT: s_mov_b64 vcc, exec
+; SI-NEXT: s_cbranch_execz .LBB0_3
+; SI-NEXT: s_branch .LBB0_4
+; SI-NEXT: .LBB0_2:
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: .LBB0_3: ; %T
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s6
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v4, v0
+; SI-NEXT: v_or_b32_e32 v3, v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; SI-NEXT: .LBB0_4: ; %exit
+; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: s_mov_b32 s4, 0xffff
+; SI-NEXT: v_mov_b32_e32 v3, 0x8000
+; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v5, 1
+; SI-NEXT: v_mov_b32_e32 v6, 0xffff8000
+; SI-NEXT: v_mov_b32_e32 v7, s4
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
+; SI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; SI-NEXT: v_and_b32_e32 v2, s4, v2
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_4xi16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX9-NEXT: ; %bb.1: ; %F
+; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_cbranch_execz .LBB0_3
+; GFX9-NEXT: s_branch .LBB0_4
+; GFX9-NEXT: .LBB0_2:
+; GFX9-NEXT: s_mov_b32 s8, 0
+; GFX9-NEXT: s_mov_b32 s9, s8
+; GFX9-NEXT: s_mov_b32 s10, s8
+; GFX9-NEXT: s_mov_b32 s11, s8
+; GFX9-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-NEXT: v_mov_b32_e32 v3, s9
+; GFX9-NEXT: v_mov_b32_e32 v4, s10
+; GFX9-NEXT: v_mov_b32_e32 v5, s11
+; GFX9-NEXT: .LBB0_3: ; %T
+; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: .LBB0_4: ; %exit
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
+; GFX9-NEXT: s_movk_i32 s4, 0x8000
+; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v3, s4, v0
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
+; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT: v_and_b32_e32 v0, v4, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v2, v4, v3
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  br i1 undef, label %T, label %F
+
+T:
+  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
+  br label %exit
+
+F:
+  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
+  br label %exit
+
+exit:
+  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
+  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32>
+  %b2 = icmp sgt <4 x i16> %v2,
+  %r2 = select <4 x i1> %b2, <4 x i16> , <4 x i16>
+  ret <4 x i16> %r2
+}
+
+define <4 x i16> @extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
+; SI-LABEL: extract_4xi16_2:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_cbranch_scc0 .LBB1_2
+; SI-NEXT: ; %bb.1: ; %F
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s6
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; SI-NEXT: v_or_b32_e32 v2, v6, v2
+; SI-NEXT: v_or_b32_e32 v3, v4, v3
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; SI-NEXT: s_mov_b64 vcc, exec
+; SI-NEXT: s_cbranch_execz .LBB1_3
+; SI-NEXT: s_branch .LBB1_4
+; SI-NEXT: .LBB1_2:
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: ; implicit-def: $vgpr5
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; implicit-def: $vgpr4
+; SI-NEXT: s_mov_b64 vcc, 0
+; SI-NEXT: .LBB1_3: ; %T
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s6
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT: v_or_b32_e32 v2, v4, v0
+; SI-NEXT: v_or_b32_e32 v3, v3, v1
+; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; SI-NEXT: .LBB1_4: ; %exit
+; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v5, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v3, v4, 0, 16
+; SI-NEXT: v_mov_b32_e32 v4, 0xffff
+; SI-NEXT: v_mov_b32_e32 v5, 0x8000
+; SI-NEXT: v_mov_b32_e32 v6, 0xffff0000
+; SI-NEXT: v_bfrev_b32_e32 v7, 1
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
+; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
+; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
+; SI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
+; SI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_4xi16_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_cbranch_scc0 .LBB1_2
+; GFX9-NEXT: ; %bb.1: ; %F
+; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_cbranch_execz .LBB1_3
+; GFX9-NEXT: s_branch .LBB1_4
+; GFX9-NEXT: .LBB1_2:
+; GFX9-NEXT: s_mov_b32 s8, 0
+; GFX9-NEXT: s_mov_b32 s9, s8
+; GFX9-NEXT: s_mov_b32 s10, s8
+; GFX9-NEXT: s_mov_b32 s11, s8
+; GFX9-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-NEXT: v_mov_b32_e32 v3, s9
+; GFX9-NEXT: v_mov_b32_e32 v4, s10
+; GFX9-NEXT: v_mov_b32_e32 v5, s11
+; GFX9-NEXT: .LBB1_3: ; %T
+; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: .LBB1_4: ; %exit
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
+; GFX9-NEXT: s_movk_i32 s4, 0x8000
+; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v2, s4, v0
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
+; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT: v_and_b32_e32 v0, v4, v0
+; GFX9-NEXT: v_and_b32_e32 v2, v4, v2
+; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+  br i1 undef, label %T, label %F
+
+T:
+  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
+  br label %exit
+
+F:
+  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
+  br label %exit
+
+exit:
+  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
+  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32>
+  %b2 = icmp sgt <4 x i16> %v2,
+  %r2 = select <4 x i1> %b2, <4 x i16> , <4 x i16>
+  ret <4 x i16> %r2
+}
+
+define <4 x half> @extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) {
+; SI-LABEL: extract_4xf16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_cbranch_scc0 .LBB2_2
+; SI-NEXT: ; %bb.1: ; %F
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s6
+; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_or_b32_e32 v4, v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB2_3 +; SI-NEXT: s_branch .LBB2_4 +; SI-NEXT: .LBB2_2: +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB2_3: ; %T +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v4, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: .LBB2_4: ; %exit +; SI-NEXT: v_cvt_f16_f32_e32 v0, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000 +; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2 +; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract_4xf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_cbranch_scc0 .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; %F +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: s_branch .LBB2_4 +; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_mov_b32 
s10, s8 +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: .LBB2_3: ; %T +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: .LBB2_4: ; %exit +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x3900 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x3d00 +; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v4, vcc +; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v6, v4, v3, vcc +; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6 +; GFX9-NEXT: s_setpc_b64 s[30:31] + br i1 undef, label %T, label %F + +T: + %t = load volatile <8 x half>, <8 x half> addrspace(1) * %p0 + br label %exit + +F: + %f = load volatile <8 x half>, <8 x half> addrspace(1) * %p1 + br label %exit + +exit: + %m = phi <8 x half> [ %t, %T ], [ %f, %F ] + %v2 = shufflevector <8 x half> %m, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800> + %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00> + ret <4 x half> %r2 +} diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -166,6 +166,19 @@ ret void } +; GCN-LABEL: {{^}}v_extractelement_v8f16_dynamic_sgpr: +; GCN-COUNT-7: v_cndmask_b32_e32 +define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(half addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %n) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext + %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep + %vec.extract = extractelement <8 x half> %vec, i32 %n + store half %vec.extract, half addrspace(1)* %out.gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -166,6 +166,55 @@ ret void } +; GCN-LABEL: {{^}}v_extractelement_v8i16_2: +; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:4 +; SI: buffer_store_short [[RES]] +; VI: flat_load_dword [[RES:v[0-9]+]] +; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] +; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4 +; GFX9: global_store_short v{{[0-9]+}}, [[RES]] +define amdgpu_kernel void @v_extractelement_v8i16_2(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64
%tid.ext + %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep + %vec.extract = extractelement <8 x i16> %vec, i32 2 + store i16 %vec.extract, i16 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_extractelement_v8i16_6: +; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:12 +; SI: buffer_store_short [[RES]] +; VI: flat_load_dword [[RES:v[0-9]+]] +; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] +; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12 +; GFX9: global_store_short v{{[0-9]+}}, [[RES]] +define amdgpu_kernel void @v_extractelement_v8i16_6(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext + %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep + %vec.extract = extractelement <8 x i16> %vec, i32 6 + store i16 %vec.extract, i16 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_extractelement_v8i16_dynamic_sgpr: +; GCN-COUNT-7: v_cndmask_b32_e32 +define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(i16 addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %n) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext + %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep + %vec.extract = extractelement <8 x i16> %vec, i32 %n + store i16 %vec.extract, i16 addrspace(1)* %out.gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -388,9 +388,7 @@ ; FIXME: Mixing buffer and global ; FIXME: Should not scalarize ; GCN-LABEL: {{^}}v5i16_func_void: -; GFX9: buffer_load_dwordx2 v[0:1] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_short_d16 v2 +; GFX9: buffer_load_dwordx4 v[0:3] ; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 define <5 x i16> @v5i16_func_void() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2289,58 +2289,58 @@ ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 20, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 28, v3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 4, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 20, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2 -; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: 
v_lshrrev_b32_e32 v8, 20, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v9, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v18, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2 ; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 +; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v18 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX8-NEXT: v_lshlrev_b16_e32 v11, 12, v11 +; GFX8-NEXT: v_lshlrev_b16_e32 v17, 12, v17 +; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 -; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_mad_u16 v2, v6, v11, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX8-NEXT: v_mad_u16 v2, v7, v12, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_mad_u16 v2, v8, v13, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX8-NEXT: v_mad_u16 v2, v17, v5, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v10, 12, v10 +; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v16 +; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX8-NEXT: v_lshlrev_b16_e32 v15, 12, v15 -; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 -; GFX8-NEXT: v_mad_u16 v2, v9, v14, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2 -; GFX8-NEXT: v_mad_u16 v2, v10, v15, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX8-NEXT: v_mad_u16 v2, v7, 
v14, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2354,7 +2354,7 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] @@ -2362,68 +2362,70 @@ ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 -; GFX9-NEXT: v_bfe_u32 v6, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v11, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 4, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 20, v1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX9-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v18, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX9-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX9-NEXT: v_and_b32_e32 v17, v4, v17 -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v6, v16, 16, v17 -; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX9-NEXT: v_and_b32_e32 v15, v4, v15 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v13 -; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v8, v14, 16, v15 -; GFX9-NEXT: v_lshl_or_b32 v4, v12, 16, v4 -; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 4, v2 +; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 20, v2 +; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2 +; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 12, v5 +; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v7 +; GFX9-NEXT: 
v_lshlrev_b16_e32 v7, 12, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v9 +; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v12 +; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v13 +; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v15 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v16 +; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v18 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v2, v1, v3 -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v6, v9, v6 +; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 +; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 +; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v8 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v4 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -2437,7 +2439,7 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 12 ; GFX9-DL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] @@ -2445,68 +2447,70 @@ ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v1 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 20, v1 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v18, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-DL-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v11, 16, v1 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v18, 16, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX9-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v17, v4, v17 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v10 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v16, 16, v17 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX9-DL-NEXT: v_and_b32_e32 v15, v4, v15 -; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v13 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v14, 16, v15 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v12, 16, v4 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 4, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 20, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v4, 12, v5 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v6 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v7 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v8 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v9 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v10 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v11 +; 
GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v12 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v13 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v14 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v15 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v16 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v18 +; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v2, v1, v3 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v9, v6 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v8 -; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 -; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v4 -; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -2529,71 +2533,85 @@ ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 28, v1 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v6, v1, 24, 4 
-; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 20, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v10, v1, 8, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v11, v1, 4, 4 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v16, v2, 4, 4 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v14, v2, 24, 4 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 8, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v17, v2, 16, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v18, v2, 12, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v2, v2, 8, 4 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v10 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v6, v4, v6 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v6 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v14 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v15 ; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v12 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v18, 28, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v17 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v13 ; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v18, 16, v2 -; 
GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v17 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v8, v15, 16, v10 -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v14 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v9, v2 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v10 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v5, 16, v3 -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v12, 16, v4 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v16 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, v4, v14 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v10 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v7, v6 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v18, 12, v18 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v12, 16, v5 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v9, 16, v10 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v18 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, v4, v1 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v6 -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v4 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v5 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v2, v3, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 
v1, v2, v1 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-XNACK-NEXT: s_endpgm @@ -2617,71 +2635,85 @@ ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v6, v1, 24, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 20, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v10, v1, 8, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v11, v1, 4, 4 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 15, v0 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v16, v0, 4, 4 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v14, v0, 24, 4 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 20, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v17, v0, 16, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v18, v0, 12, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v0, v0, 8, 4 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v10 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v6, v4, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v12, v12, 16, v13 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v15 ; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v12 +; GFX10-DL-NOXNACK-NEXT: 
v_lshrrev_b32_e32 v11, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v18, 28, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v13 ; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v18, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v17 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v8, v15, 16, v10 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v3 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v6 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v14 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v9, v0 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v5, 16, v3 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v12, 16, v4 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, v4, v14 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v10 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v7, v6 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v18, 12, v18 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v12, 16, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v9, 16, v10 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v18 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, v4, v0 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, v4, v1 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v6 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] +; 
GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v4 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3 ; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2187,32 +2187,32 @@ ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 20, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX8-NEXT: v_bfe_u32 v13, v2, 4, 4 -; GFX8-NEXT: v_bfe_u32 v14, v2, 8, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4 +; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 +; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 +; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 +; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u16 v4, v5, v12, v4 -; GFX8-NEXT: v_mad_u16 v4, v6, v13, v4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX8-NEXT: v_mad_u16 v4, v7, v14, v4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 16, 4 -; GFX8-NEXT: v_mad_u16 v4, v8, v15, v4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 20, 4 -; GFX8-NEXT: v_mad_u16 v4, v9, v16, v4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 24, 4 -; GFX8-NEXT: v_mad_u16 v4, v10, v17, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX8-NEXT: v_mad_u16 v4, v11, v18, v4 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 +; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 +; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2234,52 +2234,52 @@ ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-NEXT: v_and_b32_e32 v11, 15, v1 +; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 +; 
GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v10, v1, 16, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1 +; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_and_b32_e32 v18, 15, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 28, v2 -; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v12, v4, v12 -; GFX9-NEXT: v_and_b32_e32 v5, v4, v5 -; GFX9-NEXT: v_and_b32_e32 v14, v4, v14 -; GFX9-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX9-NEXT: v_and_b32_e32 v16, v4, v16 -; GFX9-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX9-NEXT: v_and_b32_e32 v18, v4, v18 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v11 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v18 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v4 -; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: v_lshl_or_b32 v11, v13, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v6, v15, 16, v14 -; GFX9-NEXT: v_lshl_or_b32 v7, v8, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v8, v17, 16, v16 -; GFX9-NEXT: v_lshl_or_b32 v9, v10, 16, v9 +; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2 +; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX9-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX9-NEXT: v_and_b32_e32 v17, v4, v17 +; GFX9-NEXT: v_and_b32_e32 v10, v4, v10 +; GFX9-NEXT: v_and_b32_e32 v15, v4, v15 +; GFX9-NEXT: v_and_b32_e32 v8, v4, v8 +; GFX9-NEXT: v_and_b32_e32 v13, v4, v13 +; GFX9-NEXT: v_and_b32_e32 v4, v4, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v8, v12, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v10, v14, 16, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v2, v1, v3 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v5, v11 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v6 -; GFX9-NEXT: v_pk_mul_lo_u16 v6, v9, v8 +; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v10 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v6, v16, 16, v17 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v9, v6 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 -; GFX9-NEXT: 
v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v4 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -2301,52 +2301,52 @@ ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v1 +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 16, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v18, 15, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v12, v4, v12 -; GFX9-DL-NEXT: v_and_b32_e32 v5, v4, v5 -; GFX9-DL-NEXT: v_and_b32_e32 v14, v4, v14 -; GFX9-DL-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX9-DL-NEXT: v_and_b32_e32 v16, v4, v16 -; GFX9-DL-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX9-DL-NEXT: v_and_b32_e32 v18, v4, v18 -; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v11 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v18 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v11, v13, 16, v12 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v15, 16, v14 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v8, 16, v7 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v17, 16, v16 -; GFX9-DL-NEXT: v_lshl_or_b32 v9, v10, 16, v9 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 16, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2 +; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v17, v4, v17 +; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v15, v4, v15 +; GFX9-DL-NEXT: v_and_b32_e32 v8, v4, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v13, v4, v13 +; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v6 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX9-DL-NEXT: v_lshl_or_b32 v8, v12, 16, v13 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 +; GFX9-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; GFX9-DL-NEXT: v_lshl_or_b32 v10, v14, 16, v15 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v2, v1, v3 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v11 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v6 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v9, v8 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v10 +; GFX9-DL-NEXT: 
v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v18, 16, v2 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v16, 16, v17 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v9, v6 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 -; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 -; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v4 -; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -2372,52 +2372,52 @@ ; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 ; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 ; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12 -; GFX10-DL-NEXT: v_lshl_or_b32 v7, v9, 16, v7 +; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v9 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v7 ; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 ; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v6 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 +; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 ; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v6, v3 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v11, v4, v11 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v12 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, 
v10 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 24, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v6, v11, 16, v12 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v10 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v5 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v6 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, v4, v9 +; GFX10-DL-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v6, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v10 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -2575,52 +2575,52 @@ ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v6, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 24, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 28, v3 -; GFX8-NEXT: v_bfe_u32 v9, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 12, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3 +; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v11, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v7, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_bfe_u32 v12, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v2, 24, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 28, v2 -; GFX8-NEXT: v_and_b32_e32 v11, 15, v3 -; GFX8-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 12, 4 -; GFX8-NEXT: v_and_b32_e32 v18, 15, v2 -; GFX8-NEXT: v_bfe_u32 v2, v2, 4, 4 -; GFX8-NEXT: v_mul_lo_u16_e32 v19, v5, v12 -; GFX8-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v13, v7, v14 -; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v9, v9, v16 -; GFX8-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_sdwa v15, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v19, v6 -; GFX8-NEXT: v_or_b32_e32 v6, v13, v8 -; GFX8-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX8-NEXT: v_mul_lo_u16_e32 v11, v11, v18 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: 
v_or_b32_e32 v9, v11, v15 -; GFX8-NEXT: v_or_b32_e32 v10, v15, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 28, v2 +; GFX8-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX8-NEXT: v_bfe_u32 v18, v2, 20, 4 +; GFX8-NEXT: v_bfe_u32 v14, v2, 12, 4 +; GFX8-NEXT: v_bfe_u32 v15, v2, 8, 4 +; GFX8-NEXT: v_bfe_u32 v19, v2, 16, 4 +; GFX8-NEXT: v_mul_lo_u16_sdwa v11, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v18, v10, v17 +; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_bfe_u32 v5, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v3 +; GFX8-NEXT: v_bfe_u32 v3, v2, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX8-NEXT: v_mul_lo_u16_e32 v2, v12, v19 +; GFX8-NEXT: v_mul_lo_u16_e32 v8, v8, v15 +; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v9, v18, v9 +; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v3, v2, v11 +; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX8-NEXT: v_mul_lo_u16_e32 v6, v6, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v3, v9, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v10 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v8 +; GFX8-NEXT: v_add_u16_e32 v3, v6, v4 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v7 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v6 -; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v6 +; GFX8-NEXT: v_mad_u16 v2, v12, v19, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v9 +; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v9 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2641,52 +2641,52 @@ ; GFX9-NEXT: global_load_ubyte v4, v3, s[2:3] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_bfe_u32 v0, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 28, v1 +; GFX9-NEXT: v_bfe_u32 v9, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v10, v1, 20, 4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX9-NEXT: v_and_b32_e32 v10, 15, v1 -; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX9-NEXT: v_and_b32_e32 v17, 15, v2 -; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4 -; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v12, v6, v13 -; GFX9-NEXT: v_mul_lo_u16_sdwa v7, 
v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v18, v5, v11 -; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15 -; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17 -; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v7, v12, v7 -; GFX9-NEXT: v_or_b32_e32 v1, v18, v0 -; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-NEXT: v_or_b32_e32 v9, v10, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2 +; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v1, v2, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v2, v2, 16, 4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v17, v9, v16 +; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v18, v11, v2 +; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14 +; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v8, v17, v8 +; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 +; GFX9-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v18, v10 +; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v5, v12 +; GFX9-NEXT: v_or_b32_e32 v7, v12, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v1, v9, v4 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v2 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v8 +; GFX9-NEXT: v_add_u16_e32 v1, v5, v4 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v7 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-NEXT: v_mad_legacy_u16 v0, v5, v11, v0 +; GFX9-NEXT: v_mad_legacy_u16 v0, v11, v2, v0 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: v_mad_legacy_u16 v0, v6, v13, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -2707,52 +2707,52 @@ ; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[2:3] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 +; GFX9-DL-NEXT: 
v_lshrrev_b32_e32 v8, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v12, v6, v13 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v5, v11 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v7, v12, v7 -; GFX9-DL-NEXT: v_or_b32_e32 v1, v18, v0 -; GFX9-DL-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX9-DL-NEXT: v_or_b32_e32 v9, v10, v2 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v1, v2, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 16, 4 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v17, v9, v16 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v11, v2 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v8, v17, v8 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v1, v18, v10 +; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v5, v5, v12 +; GFX9-DL-NEXT: v_or_b32_e32 v7, v12, v0 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; GFX9-DL-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v1, v9, v4 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v5, v4 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v7 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v5, v11, v0 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v11, v2, v0 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v10 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v7 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v6, v13, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 ; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -2774,55 +2774,55 @@ ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 ; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX10-DL-NEXT: v_mul_lo_u16 v9, v9, v10 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 24, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v1 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 4, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v13 -; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v14 -; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 16, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX10-DL-NEXT: v_mul_lo_u16 v1, v1, v15 -; GFX10-DL-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX10-DL-NEXT: v_mul_lo_u16 v9, v0, v10 -; GFX10-DL-NEXT: v_mul_lo_u16 v10, v6, v13 -; GFX10-DL-NEXT: v_lshlrev_b16 v7, 8, v7 +; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v10 +; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 24, 4 +; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 +; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 4, 4 +; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v13 +; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v2 +; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 24, 4 +; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 16, 4 +; GFX10-DL-NEXT: v_mul_lo_u16 v2, v8, v14 +; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX10-DL-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX10-DL-NEXT: v_mul_lo_u16 v1, v11, v13 +; GFX10-DL-NEXT: v_mul_lo_u16 v7, v9, v15 +; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v0 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v10 +; GFX10-DL-NEXT: v_mul_lo_u16 v10, v12, v16 ; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v2 -; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v12 -; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9 -; GFX10-DL-NEXT: v_or_b32_e32 v7, v10, v7 -; GFX10-DL-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; 
GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX10-DL-NEXT: v_or_b32_e32 v2, v11, v9 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; GFX10-DL-NEXT: v_or_b32_e32 v7, v7, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v2, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX10-DL-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v1, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u16 v9, v3, v10 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v2 ; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v0, v9, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v5, v6 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v12, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v12, v16, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v13, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v9, v15, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm @@ -2941,40 +2941,32 @@ ; GFX8-NEXT: s_add_u32 s8, s8, s3 ; GFX8-NEXT: s_addc_u32 s9, s9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 20, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX8-NEXT: v_bfe_u32 v13, v2, 4, 4 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12 -; GFX8-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX8-NEXT: v_mul_u32_u24_e32 v6, v6, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 +; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 +; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 +; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v4, v5, v4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX8-NEXT: v_mul_u32_u24_e32 v7, v7, v14 -; GFX8-NEXT: v_add_u16_e32 v4, v4, v6 -; GFX8-NEXT: v_bfe_u32 v16, v2, 16, 4 -; GFX8-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX8-NEXT: v_add_u16_e32 v4, v4, v7 -; GFX8-NEXT: v_bfe_u32 v17, v2, 20, 4 -; GFX8-NEXT: v_mul_u32_u24_e32 v9, v9, v16 -; GFX8-NEXT: v_add_u16_e32 v4, v4, v8 -; GFX8-NEXT: v_bfe_u32 v11, v3, 24, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX8-NEXT: v_bfe_u32 v18, v2, 24, 4 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX8-NEXT: v_mul_u32_u24_e32 v10, v10, v17 -; GFX8-NEXT: v_add_u16_e32 v4, v4, v9 -; GFX8-NEXT: v_mul_u32_u24_e32 v2, v3, v2 -; GFX8-NEXT: v_mul_u32_u24_e32 v3, v11, v18 -; GFX8-NEXT: 
v_add_u16_e32 v4, v4, v10 -; GFX8-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 +; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 +; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 +; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 +; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 +; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2989,47 +2981,60 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v10, v1, 16, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1 +; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11 -; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2 +; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX9-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX9-NEXT: v_and_b32_e32 v17, v4, v17 +; GFX9-NEXT: v_and_b32_e32 v10, v4, v10 +; GFX9-NEXT: v_and_b32_e32 v15, v4, v15 +; GFX9-NEXT: v_and_b32_e32 v8, v4, v8 +; GFX9-NEXT: v_and_b32_e32 v13, v4, v13 +; GFX9-NEXT: v_and_b32_e32 v4, v4, v6 +; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v8, v12, 16, v13 +; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 +; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; GFX9-NEXT: v_lshl_or_b32 v10, v14, 16, v15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v10 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v6, v16, 16, v17 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v6 -; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v7 -; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; 
GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v8 -; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX9-NEXT: v_mul_u32_u24_e32 v2, v10, v17 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v9 -; GFX9-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v9, v6 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm @@ -3044,106 +3049,135 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 ; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 16, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v12 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 16, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2 +; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v17, v4, v17 +; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v15, v4, v15 +; GFX9-DL-NEXT: v_and_b32_e32 v8, v4, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v13, v4, v13 +; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v6 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX9-DL-NEXT: v_lshl_or_b32 v8, v12, 16, v13 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 +; GFX9-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; GFX9-DL-NEXT: v_lshl_or_b32 v10, v14, 16, v15 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v13 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v10 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v18, 16, v2 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v16, 16, v17 ; 
GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v14 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v6 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v7 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v8 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v10, v17 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v9 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v9, v6 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 ; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10-DL-NEXT: s_add_u32 s8, s8, s3 -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 8, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v7 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2 +; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 +; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v9 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6 +; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v13, 
v4, v13 +; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13 +; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 12, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v8 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 16, 4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v7 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 20, 4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v8 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v10 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 24, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v6, v11, 16, v12 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, v4, v9 +; GFX10-DL-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v6, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v10 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v7 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v8 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5 -; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3] +; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, i4 addrspace(1)* nocapture %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll @@ -20,9 +20,9 @@ ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define amdgpu_kernel void @s_input_output_v8f16() { - %v = tail call <8 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() - tail call void asm sideeffect "; use $0", "s"(<8 x half> %v) +define amdgpu_kernel void @s_input_output_v16f16() { + %v = tail call <16 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() + tail call void asm sideeffect "; use $0", "s"(<16 x half> %v) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- 
a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1745,6 +1745,345 @@ ret void } +define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val) { +; GFX9-LABEL: v_insertelement_v8f16_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_insertelement_v8f16_3: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:16 +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4 +; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_bfi_b32 v3, s4, v3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; CI-LABEL: v_insertelement_v8f16_3: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 +; CI-NEXT: s_lshl_b32 s0, s4, 16 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_or_b32_e32 v1, s0, v1 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep + %val.trunc = trunc i32 %val to i16 + %val.cvt = bitcast i16 %val.trunc to half + %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3 + store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @v_insertelement_v8i16_6(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %val) { +; GFX9-LABEL: v_insertelement_v8i16_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 +; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_bfi_b32 v3, v6, v5, v3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_insertelement_v8i16_6: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s4 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:16 +; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_bfi_b32 v1, s4, v1, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_bfi_b32 v3, s4, v6, v3 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; CI-LABEL: v_insertelement_v8i16_6: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: v_mov_b32_e32 v6, s4 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_bfi_b32 v3, s0, v6, v3 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %out, i64 %tid.ext + %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep + %val.trunc = trunc i32 %val to i16 + %val.cvt = bitcast i16 %val.trunc to i16 + %vecins = insertelement <8 x i16> %vec, i16 %val.cvt, i32 6 + store <8 x i16> %vecins, <8 x i16> addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val, i32 %n) { +; GFX9-LABEL: v_insertelement_v8f16_dynamic: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX9-NEXT: 
v_cndmask_b32_e32 v7, v7, v6, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 5 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, v5, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 2 +; GFX9-NEXT: v_lshl_or_b32 v3, v7, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 +; GFX9-NEXT: v_lshl_or_b32 v2, v8, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v6, vcc +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v5, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v8, 16, v0 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; VI-LABEL: v_insertelement_v8f16_dynamic: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 +; VI-NEXT: s_cmp_eq_u32 s5, 6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 4 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 5 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 2 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 3 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 0 +; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s5, 1 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v6 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; CI-LABEL: v_insertelement_v8f16_dynamic: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 +; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; CI-NEXT: s_cmp_eq_u32 s5, 7 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 6 +; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 5 +; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 4 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 2 +; CI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 1 +; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: s_cmp_eq_u32 s5, 0 +; CI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc +; CI-NEXT: s_cselect_b64 vcc, -1, 0 +; CI-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; CI-NEXT: v_or_b32_e32 v3, v3, v6 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; CI-NEXT: v_or_b32_e32 v2, v2, v7 +; CI-NEXT: v_or_b32_e32 v1, v1, v8 +; CI-NEXT: v_or_b32_e32 v0, v0, v6 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext + %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep + %val.trunc = trunc i32 %val to i16 + %val.cvt = bitcast i16 %val.trunc to half + %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 %n + store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll 
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -1875,16 +1875,15 @@ ; ; GFX9-LABEL: v5i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x18 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_short v2, v3, s[2:3] offset:8 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_short v2, v3, s[6:7] offset:8 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i16_arg: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -4727,10 +4727,10 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s2, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -5581,14 +5581,14 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -571,6 +571,7 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc @@ -580,8 +581,8 @@ ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v0, v4, v2 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; 
GFX9-NEXT: s_endpgm @@ -617,6 +618,7 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -627,8 +629,8 @@ ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_pk_sub_i16 v2, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, v3, v2 +; GFX10-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm
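
As a rough, hedged illustration of what the updated v8i16/v8f16 checks above exercise (this sketch is not part of the patch): once these types are legal, whole-vector 16-bit arithmetic is split by the custom lowering into two <4 x i16> halves instead of being fully scalarized, which VOP3P targets can then handle as packed 16-bit operations. The function and value names below are hypothetical; the typed-pointer IR follows the style of the tests in this patch.

; Hypothetical sketch, not a test from this patch: an <8 x i16> add that
; the new lowering can split into two <4 x i16> halves rather than
; scalarizing into eight 16-bit operations.
define amdgpu_kernel void @v8i16_add_sketch(<8 x i16> addrspace(1)* %out,
                                            <8 x i16> addrspace(1)* %a,
                                            <8 x i16> addrspace(1)* %b) {
entry:
  %va = load <8 x i16>, <8 x i16> addrspace(1)* %a
  %vb = load <8 x i16>, <8 x i16> addrspace(1)* %b
  %sum = add <8 x i16> %va, %vb
  store <8 x i16> %sum, <8 x i16> addrspace(1)* %out
  ret void
}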