Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -454,6 +454,7 @@
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
     SDValue TransformFPLoadStorePair(SDNode *N);
+    SDValue convertBuildVecZextToZext(SDNode *N);
     SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
     SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
     SDValue reduceBuildVecToShuffle(SDNode *N);
@@ -14959,6 +14960,54 @@
   return Shuffles[0];
 }
 
+// Try to turn a build vector of zero extends of extract vector elts into a
+// vector zero extend and possibly an extract subvector.
+// TODO: Support sign extend or any extend?
+// TODO: Allow undef elements?
+// TODO: Don't require the extracts to start at element 0.
+SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
+  if (LegalOperations)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  SDValue Op0 = N->getOperand(0);
+  auto checkElem = [&](SDValue Op) -> int64_t {
+    if (Op.getOpcode() == ISD::ZERO_EXTEND &&
+        Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0))
+      if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1)))
+        return C->getZExtValue();
+    return -1;
+  };
+
+  // Make sure the first element matches
+  // (zext (extract_vector_elt X, C))
+  int64_t Offset = checkElem(Op0);
+  if (Offset < 0)
+    return SDValue();
+
+  unsigned NumElems = N->getNumOperands();
+  SDValue In = Op0.getOperand(0).getOperand(0);
+  EVT InSVT = In.getValueType().getScalarType();
+  EVT InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumElems);
+
+  // Don't create an illegal input type after type legalization.
+  if (LegalTypes && !TLI.isTypeLegal(InVT))
+    return SDValue();
+
+  // Ensure all the elements come from the same vector and are adjacent.
+  for (unsigned i = 1; i != NumElems; ++i) {
+    if (Offset + i != checkElem(N->getOperand(i)))
+      return SDValue();
+  }
+
+  SDLoc DL(N);
+  In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In,
+                   Op0.getOperand(0).getOperand(1));
+  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, In);
+}
+
 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
   EVT VT = N->getValueType(0);
 
@@ -15018,6 +15067,9 @@
                           Op0.getOperand(0), Op0.getOperand(1));
   }
 
+  if (SDValue V = convertBuildVecZextToZext(N))
+    return V;
+
   if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
     return V;
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1344,6 +1344,8 @@
     setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
     setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
     setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+
+    setOperationAction(ISD::MUL, MVT::v8i64, Legal);
   }
 
   if (Subtarget.hasCDI()) {
@@ -1432,6 +1434,8 @@
       setOperationAction(ISD::UINT_TO_FP, VT, Legal);
       setOperationAction(ISD::FP_TO_SINT, VT, Legal);
       setOperationAction(ISD::FP_TO_UINT, VT, Legal);
+
+      setOperationAction(ISD::MUL, VT, Legal);
     }
   }
 
@@ -5117,10 +5121,11 @@
 template <typename F>
 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                          const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
-                         F Builder) {
+                         F Builder, bool CheckBWI = true) {
   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
   unsigned NumSubs = 1;
-  if (Subtarget.useBWIRegs()) {
+  if ((CheckBWI && Subtarget.useBWIRegs()) ||
+      (!CheckBWI && Subtarget.useAVX512Regs())) {
     if (VT.getSizeInBits() > 512) {
       NumSubs = VT.getSizeInBits() / 512;
       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
     }
@@ -22516,13 +22521,7 @@
   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
          "Only know how to lower V2I64/V4I64/V8I64 multiply");
-
-  // MULDQ returns the 64-bit result of the signed multiplication of the lower
-  // 32-bits. We can lower with this if the sign bits stretch that far.
-  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
-      DAG.ComputeNumSignBits(B) > 32) {
-    return DAG.getNode(X86ISD::PMULDQ, dl, VT, A, B);
-  }
+  assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
 
   // Ahi = psrlqi(a, 32);
   // Bhi = psrlqi(b, 32);
@@ -22545,11 +22544,6 @@
   bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
   bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
 
-  // If DQI is supported we can use MULLQ, but MULUDQ is still better if the
-  // the high bits are known to be zero.
-  if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
-    return Op;
-
   SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
 
   // Only multiply lo/hi halves that aren't known to be zero.
@@ -33181,6 +33175,47 @@
                           PMADDWDBuilder);
 }
 
+static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  // Only support vXi64 vectors.
+  if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // MULDQ returns the 64-bit result of the signed multiplication of the lower
+  // 32-bits. We can lower with this if the sign bits stretch that far.
+  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
+      DAG.ComputeNumSignBits(N1) > 32) {
+    auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                            ArrayRef<SDValue> Ops) {
+      return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
+    };
+    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
+                            PMULDQBuilder, /*CheckBWI*/false);
+  }
+
+  // If the upper bits are zero we can use a single pmuludq.
+  APInt Mask = APInt::getHighBitsSet(64, 32);
+  if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
+    auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+                             ArrayRef<SDValue> Ops) {
+      return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
+    };
+    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
+                            PMULUDQBuilder, /*CheckBWI*/false);
+  }
+
+  return SDValue();
+}
+
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
@@ -33191,6 +33226,9 @@
   if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
     return V;
 
+  if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
+    return V;
+
   if (DCI.isBeforeLegalize() && VT.isVector())
     return reduceVMULWidth(N, DAG, Subtarget);
 
@@ -35793,7 +35831,7 @@
     // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
     // better to truncate if we have the chance.
     if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
-        !Subtarget.hasDQI())
+        !TLI.isOperationLegal(Opcode, SrcVT))
       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
     LLVM_FALLTHROUGH;
   case ISD::ADD: {
Index: test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
===================================================================
--- test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
+++ test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
@@ -36,18 +36,14 @@
   ret float %3
 }
 
-; The backend generates for the following code an
-; (and 0xff (i32 extract_vector_elt (zext load <4 x i8> to 4 x i16)))
-;
-; The and is not redundant and cannot be removed. Since
-; extract_vector_elt is doing an implicit any_ext, the and
-; is required to guarantee that the top bits are set to zero.
-
-; Ideally should be a zext from <4 x i8> to <4 x 32>.
+; Make sure we generate zext from <4 x i8> to <4 x i32>.
 
 ; CHECK-LABEL: h:
 ; CHECK: vld1.32
-; CHECK: uxtb
+; CHECK: vmovl.u8 q8, d16
+; CHECK: vmovl.u16 q8, d16
+; CHECK: vmov r0, r1, d16
+; CHECK: vmov r2, r3, d17
 define <4 x i32> @h(<4 x i8> *%in) {
   %1 = load <4 x i8>, <4 x i8>* %in, align 4
   %2 = extractelement <4 x i8> %1, i32 0
Index: test/CodeGen/X86/combine-pmuldq.ll
===================================================================
--- test/CodeGen/X86/combine-pmuldq.ll
+++ test/CodeGen/X86/combine-pmuldq.ll
@@ -5,23 +5,14 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX --check-prefix=AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=AVX --check-prefix=AVX512DQVL
 
-; TODO - shuffle+sext are superfluous
 define <2 x i64> @combine_shuffle_sext_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: combine_shuffle_sext_pmuldq:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT:    pmovsxdq %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SSE-NEXT:    pmovsxdq %xmm0, %xmm0
-; SSE-NEXT:    pmuldq %xmm2, %xmm0
+; SSE-NEXT:    pmuldq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_shuffle_sext_pmuldq:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX-NEXT:    vpmovsxdq %xmm1, %xmm1
 ; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
@@ -32,23 +23,14 @@
   ret <2 x i64> %5
 }
 
-; TODO - shuffle+zext are superfluous
 define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
 ; SSE-LABEL: combine_shuffle_zext_pmuludq:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT:    pmuludq %xmm2, %xmm0
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_shuffle_zext_pmuludq:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
Index: test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- test/CodeGen/X86/masked_gather_scatter.ll
+++ test/CodeGen/X86/masked_gather_scatter.ll
@@ -472,7 +472,7 @@
 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; KNL_64-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
@@ -503,7 +503,7 @@
 ; SKX_SMALL:       # %bb.0: # %entry
 ; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
 ; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX_SMALL-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_SMALL-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
 ; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -515,7 +515,7 @@
 ; SKX_LARGE-LABEL: test9:
 ; SKX_LARGE:       # %bb.0: # %entry
 ; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %zmm2
-; SKX_LARGE-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
 ; SKX_LARGE-NEXT:    vpmuldq (%rax){1to8}, %zmm1, %zmm1
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
@@ -558,7 +558,7 @@
 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
+; KNL_64-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; KNL_64-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
@@ -589,7 +589,7 @@
 ; SKX_SMALL:       # %bb.0: # %entry
 ; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
 ; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX_SMALL-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_SMALL-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
 ; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -601,7 +601,7 @@
 ; SKX_LARGE-LABEL: test10:
 ; SKX_LARGE:       # %bb.0: # %entry
 ; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %zmm2
-; SKX_LARGE-NEXT:    vpmovsxdq %ymm1, %zmm1
+; SKX_LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
 ; SKX_LARGE-NEXT:    vpmuldq (%rax){1to8}, %zmm1, %zmm1
 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
Index: test/CodeGen/X86/mulvi32.ll
===================================================================
--- test/CodeGen/X86/mulvi32.ll
+++ test/CodeGen/X86/mulvi32.ll
@@ -167,22 +167,22 @@
 ;
 ; SSE42-LABEL: _mul4xi32toi64a:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    pxor %xmm3, %xmm3
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
 ; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; SSE42-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE42-NEXT:    pmuludq %xmm0, %xmm1
-; SSE42-NEXT:    pmuludq %xmm4, %xmm2
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT:    pmuludq %xmm3, %xmm2
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT:    pmuludq %xmm4, %xmm0
+; SSE42-NEXT:    movdqa %xmm2, %xmm1
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: _mul4xi32toi64a:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX1-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
@@ -191,13 +191,8 @@
 ;
 ; AVX2-LABEL: _mul4xi32toi64a:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %f00 = extractelement <4 x i32> %0, i32 0
Index: test/CodeGen/X86/pmul.ll
===================================================================
--- test/CodeGen/X86/pmul.ll
+++ test/CodeGen/X86/pmul.ll
@@ -1435,7 +1435,7 @@
 ; AVX512-LABEL: mul_v8i64_sext:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; AVX512-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %1 = sext <8 x i16> %val1 to <8 x i64>
Index: test/CodeGen/X86/xop-ifma.ll
===================================================================
--- test/CodeGen/X86/xop-ifma.ll
+++ test/CodeGen/X86/xop-ifma.ll
@@ -81,8 +81,8 @@
 ;
 ; XOP-AVX2-LABEL: test_mulx_v4i32_add_v4i64:
 ; XOP-AVX2:       # %bb.0:
-; XOP-AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
-; XOP-AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; XOP-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; XOP-AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; XOP-AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
 ; XOP-AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; XOP-AVX2-NEXT:    retq
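
For context, below is a small hand-written IR sketch (not part of the patch; the function name @zext_extract_sketch and the comments are illustrative assumptions) of the shape that the new DAGCombiner::convertBuildVecZextToZext hook is meant to catch, modeled on the _mul4xi32toi64a and @h tests updated above:

; Sketch only: the scalarized zext-of-extractelement form below is what
; selection typically turns into a BUILD_VECTOR of
; (zext (extract_vector_elt X, C)) nodes with adjacent indices starting at 0.
define <2 x i64> @zext_extract_sketch(<4 x i32> %v) {
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %z0 = zext i32 %e0 to i64
  %z1 = zext i32 %e1 to i64
  %r0 = insertelement <2 x i64> undef, i64 %z0, i32 0
  %r1 = insertelement <2 x i64> %r0, i64 %z1, i32 1
  ret <2 x i64> %r1
}

With the combine, that BUILD_VECTOR is rewritten to (zero_extend (extract_subvector %v, 0)), which on x86 can then select to a pmovzxdq-style zero extend instead of the shuffle sequences removed from the CHECK lines above.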