Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -782,8 +782,8 @@
     setOperationAction(ISD::MUL, MVT::v16i8, Custom);
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
-    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
-    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
+    setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
+    setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
     setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
     setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
     setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
@@ -1087,9 +1087,8 @@
     setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
     setOperationAction(ISD::MUL, MVT::v32i8, Custom);
-    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
-    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
-
+    setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
+    setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
     setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
     setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
     setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
@@ -1331,8 +1330,8 @@
     setOperationAction(ISD::MUL, MVT::v8i64, Custom);
     setOperationAction(ISD::MUL, MVT::v16i32, Legal);
-    setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
-    setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
+    setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
+    setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
     setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
     setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
@@ -22891,6 +22890,76 @@
   if (VT.is256BitVector() && !Subtarget.hasInt256())
     return Lower256IntArith(Op, DAG);

+  if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
+    assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
+           (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
+           (VT == MVT::v16i32 && Subtarget.hasAVX512()));
+    SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+
+    int NumElts = VT.getVectorNumElements();
+
+    // PMULxD operations multiply each even value (starting at 0) of LHS with
+    // the related value of RHS and produce a widened result.
+    // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+    // => <2 x i64> <ae|cg>
+    //
+    // In other words, to have all the results, we need to perform two PMULxD:
+    // 1. one with the even values.
+    // 2. one with the odd values.
+    // To achieve #2, we need to place the odd values at an even position.
+    //
+    // Place the odd value at an even position (basically, shift all values 1
+    // step to the left):
+    const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
+                        9, -1, 11, -1, 13, -1, 15, -1};
+    // <a|b|c|d> => <b|undef|d|undef>
+    SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
+                                        makeArrayRef(&Mask[0], NumElts));
+    // <e|f|g|h> => <f|undef|h|undef>
+    SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
+                                        makeArrayRef(&Mask[0], NumElts));
+
+    // Emit two multiplies, one for the lower 2 ints and one for the higher 2
+    // ints.
+    MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
+    bool IsSigned = Op->getOpcode() == ISD::MULHS;
+    unsigned Opcode =
+        (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+    // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+    // => <2 x i64> <ae|cg>
+    SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+                                                  DAG.getBitcast(MulVT, Op0),
+                                                  DAG.getBitcast(MulVT, Op1)));
+    // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+    // => <2 x i64> <bf|dh>
+    SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
+                                                  DAG.getBitcast(MulVT, Odd0),
+                                                  DAG.getBitcast(MulVT, Odd1)));
+
+    // Shuffle it back into the right order.
+    SmallVector<int, 16> ShufMask(NumElts);
+    for (int i = 0; i != NumElts; ++i) {
+      ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
+    }
+
+    SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
+
+    // If we have a signed multiply but no PMULDQ, fix up the result of an
+    // unsigned multiply.
+    if (IsSigned && !Subtarget.hasSSE41()) {
+      SDValue ShAmt = DAG.getConstant(31, dl, VT);
+      SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+                               DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+      SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+                               DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+      SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+      Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
+    }
+
+    return Res;
+  }
+
   // Only i8 vectors should need custom lowering after this.
   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
           (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
@@ -23074,105 +23143,6 @@
   return DAG.getBitcast(VT, CallInfo.first);
 }

-static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
-                             SelectionDAG &DAG) {
-  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
-  MVT VT = Op0.getSimpleValueType();
-  SDLoc dl(Op);
-
-  // Decompose 256-bit ops into smaller 128-bit ops.
-  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
-    unsigned Opcode = Op.getOpcode();
-    unsigned NumElems = VT.getVectorNumElements();
-    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
-    SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
-    SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
-    SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
-    SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
-    SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
-    SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
-    SDValue Ops[] = {
-      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
-      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
-    };
-    return DAG.getMergeValues(Ops, dl);
-  }
-
-  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
-         (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
-         (VT == MVT::v16i32 && Subtarget.hasAVX512()));
-
-  int NumElts = VT.getVectorNumElements();
-
-  // PMULxD operations multiply each even value (starting at 0) of LHS with
-  // the related value of RHS and produce a widen result.
-  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
-  // => <2 x i64> <ae|cg>
-  //
-  // In other word, to have all the results, we need to perform two PMULxD:
-  // 1. one with the even values.
-  // 2. one with the odd values.
-  // To achieve #2, with need to place the odd values at an even position.
-  //
-  // Place the odd value at an even position (basically, shift all values 1
-  // step to the left):
-  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
-  // <a|b|c|d> => <b|undef|d|undef>
-  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
-                                      makeArrayRef(&Mask[0], NumElts));
-  // <e|f|g|h> => <f|undef|h|undef>
-  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
-                                      makeArrayRef(&Mask[0], NumElts));
-
-  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
-  // ints.
-  MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
-  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
-  unsigned Opcode =
-      (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
-  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
-  // => <2 x i64> <ae|cg>
-  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
-                                                DAG.getBitcast(MulVT, Op0),
-                                                DAG.getBitcast(MulVT, Op1)));
-  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
-  // => <2 x i64> <bf|dh>
-  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
-                                                DAG.getBitcast(MulVT, Odd0),
-                                                DAG.getBitcast(MulVT, Odd1)));
-
-  // Shuffle it back into the right order.
-  SmallVector<int, 16> HighMask(NumElts);
-  SmallVector<int, 16> LowMask(NumElts);
-  for (int i = 0; i != NumElts; ++i) {
-    HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
-    LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
-  }
-
-  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
-  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
-
-  // If we have a signed multiply but no PMULDQ fix up the high parts of a
-  // unsigned multiply.
-  if (IsSigned && !Subtarget.hasSSE41()) {
-    SDValue ShAmt = DAG.getConstant(
-        31, dl,
-        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
-    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
-                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
-    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
-                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
-
-    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
-    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
-  }
-
-  // The first result of MUL_LOHI is actually the low value, followed by the
-  // high value.
- SDValue Ops[] = {Lows, Highs}; - return DAG.getMergeValues(Ops, dl); -} - // Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, @@ -25569,8 +25539,6 @@ case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::MULHS: case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); - case ISD::UMUL_LOHI: - case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); case ISD::ROTL: case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: Index: test/CodeGen/X86/vector-idiv-sdiv-128.ll =================================================================== --- test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -78,22 +78,19 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind { ; SSE2-LABEL: test_div7_4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm3 +; SSE2-NEXT: pmuludq %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: psubd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: psubd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrld $31, %xmm0 @@ -134,13 +131,12 @@ ; ; AVX2-LABEL: test_div7_4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1 ; AVX2-NEXT: vpsrad $2, %xmm0, %xmm0 @@ -384,33 +380,30 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm3 +; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrld $31, %xmm2 -; SSE2-NEXT: psrad $2, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrld $31, %xmm1 +; SSE2-NEXT: psrad $2, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [7,7,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: psubd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_4i32: @@ -448,13 +441,12 @@ ; ; AVX2-LABEL: test_rem7_4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 ; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1 Index: test/CodeGen/X86/vector-idiv-sdiv-256.ll =================================================================== --- test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -88,41 +88,37 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { ; AVX1-LABEL: test_div7_8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $31, 
%xmm2, %xmm3 -; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7] +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpsrld $31, %xmm0, %xmm2 ; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_div7_8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] +; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmuldq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrld $31, %ymm0, %ymm1 ; AVX2-NEXT: vpsrad $2, %ymm0, %ymm0 @@ -363,46 +359,42 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; AVX1-LABEL: test_rem7_8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[2454267027,2454267027,2454267027,2454267027] +; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7] +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $31, %xmm2, %xmm4 +; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7] +; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuldq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2 ; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3 ; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7] -; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuldq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $31, %xmm1, %xmm4 -; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_rem7_8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] +; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmuldq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2 ; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1 Index: test/CodeGen/X86/vector-idiv-sdiv-512.ll =================================================================== --- test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -86,7 +86,6 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] ; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 -; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; 
AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1 ; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] @@ -313,7 +312,6 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] ; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 -; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1 ; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] Index: test/CodeGen/X86/vector-idiv-udiv-128.ll =================================================================== --- test/CodeGen/X86/vector-idiv-udiv-128.ll +++ test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -128,13 +128,12 @@ ; ; AVX2-LABEL: test_div7_4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -422,13 +421,12 @@ ; ; AVX2-LABEL: test_rem7_4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 Index: test/CodeGen/X86/vector-idiv-udiv-256.ll =================================================================== --- test/CodeGen/X86/vector-idiv-udiv-256.ll +++ test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -96,41 +96,37 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind { ; AVX1-LABEL: test_div7_8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_div7_8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 @@ -371,46 +367,42 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind { ; AVX1-LABEL: test_rem7_8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7] +; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7] +; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7] -; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_rem7_8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 Index: test/CodeGen/X86/vector-idiv-udiv-512.ll =================================================================== --- test/CodeGen/X86/vector-idiv-udiv-512.ll +++ test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -94,7 +94,6 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] ; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 -; AVX-NEXT: vpshufd {{.*#+}} zmm1 = 
zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1 ; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] @@ -324,7 +323,6 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] ; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 -; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1 ; AVX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] Index: test/CodeGen/X86/vector-idiv.ll =================================================================== --- test/CodeGen/X86/vector-idiv.ll +++ test/CodeGen/X86/vector-idiv.ll @@ -24,24 +24,19 @@ ; SSE2-LABEL: PR20355: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE2-NEXT: psubd %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrld $31, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: PR20355: @@ -71,13 +66,12 @@ ; ; AVX2-LABEL: PR20355: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1431655766,1431655766,1431655766,1431655766] +; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq Index: test/CodeGen/X86/vselect-avx.ll =================================================================== --- test/CodeGen/X86/vselect-avx.ll +++ test/CodeGen/X86/vselect-avx.ll @@ -106,13 +106,12 @@ ; ; AVX2-LABEL: test3: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = 
[1431655766,1431655766,1431655766,1431655766] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; AVX2-NEXT: vpmuldq %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpmuldq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766] +; AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpmuldq %xmm4, %xmm0, %xmm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] ; AVX2-NEXT: vpsrld $31, %xmm3, %xmm4 ; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,3,3,3]
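
Reviewer note (illustrative, not part of the patch): the SSE2-only path in the new LowerMULH code recovers a signed high multiply from PMULUDQ's unsigned high half via the SRA/AND/ADD/SUB "Fixup" sequence. The standalone scalar sketch below models that identity for a single 32-bit lane; the function name and test values are mine, chosen only to make the sketch self-checking.

// Scalar model of the MULHS-from-MULHU fixup used when PMULDQ is unavailable.
#include <cassert>
#include <cstdint>

// Signed high 32 bits of A*B, computed from the unsigned high half plus fixup.
static uint32_t mulhsViaMulhu(int32_t A, int32_t B) {
  uint32_t UA = static_cast<uint32_t>(A), UB = static_cast<uint32_t>(B);
  // Unsigned widening multiply, keep the high 32 bits (what PMULUDQ provides).
  uint32_t HiU = static_cast<uint32_t>((static_cast<uint64_t>(UA) * UB) >> 32);
  // T1/T2 mirror the SRA(31)+AND nodes: pick the other operand iff this one is
  // negative.
  uint32_t T1 = (A < 0) ? UB : 0;
  uint32_t T2 = (B < 0) ? UA : 0;
  // Wraps mod 2^32, matching the vector ISD::SUB on i32 lanes.
  return HiU - (T1 + T2);
}

int main() {
  const int32_t Vals[] = {0, 1, -1, 7, -7, 613566757, INT32_MAX, INT32_MIN};
  for (int32_t A : Vals)
    for (int32_t B : Vals) {
      // Reference: full signed 64-bit product, arithmetic shift assumed.
      uint32_t Expected =
          static_cast<uint32_t>((static_cast<int64_t>(A) * B) >> 32);
      assert(mulhsViaMulhu(A, B) == Expected);
    }
  return 0;
}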