diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -562,6 +562,10 @@ "PreferLowerShuffleAsShift", "true", "Shifts are faster (or as fast) as shuffle">; +def TuningFastImmVectorShift : SubtargetFeature<"tuning-fast-imm-vector-shift", + "FastImmVectorShift", "true", + "Vector shifts are fast (2/cycle) as opposed to slow (1/cycle)">; + // On some X86 processors, a vzeroupper instruction should be inserted after // using ymm/zmm registers before executing code that may use SSE instructions. def TuningInsertVZEROUPPER @@ -939,7 +943,8 @@ TuningPreferShiftShuffle, TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, - TuningNoDomainDelayBlend]; + TuningNoDomainDelayBlend, + TuningFastImmVectorShift]; list<SubtargetFeature> SKXFeatures = !listconcat(BDWFeatures, SKXAdditionalFeatures); @@ -980,7 +985,8 @@ TuningAllowLight256Bit, TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, - TuningNoDomainDelayBlend]; + TuningNoDomainDelayBlend, + TuningFastImmVectorShift]; list<SubtargetFeature> CNLFeatures = !listconcat(SKLFeatures, CNLAdditionalFeatures); @@ -1008,7 +1014,8 @@ TuningAllowLight256Bit, TuningNoDomainDelayMov, TuningNoDomainDelayShuffle, - TuningNoDomainDelayBlend]; + TuningNoDomainDelayBlend, + TuningFastImmVectorShift]; list<SubtargetFeature> ICLFeatures = !listconcat(CNLFeatures, ICLAdditionalFeatures); @@ -1170,7 +1177,8 @@ FeatureMOVDIR64B, FeatureWAITPKG]; list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps, - TuningPreferMovmskOverVTest]; + TuningPreferMovmskOverVTest, + TuningFastImmVectorShift]; list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning); list<SubtargetFeature> ADLFeatures = !listconcat(TRMFeatures, ADLAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48722,12 +48722,25 @@ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); - if (VT != MVT::i64 && VT != MVT::i32) + if (VT != MVT::i64 && VT != MVT::i32 && + (!VT.isVector() || !VT.isSimple() || !VT.isInteger())) return SDValue(); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!C) - return SDValue(); + ConstantSDNode *CNode = isConstOrConstSplat( + N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false); + const APInt *C = nullptr; + if (!CNode) { + if (VT.isVector()) + if (auto *RawC = getTargetConstantFromNode(N->getOperand(1))) + if (auto *SplatC = RawC->getSplatValue()) + C = &(SplatC->getUniqueInteger()); + + if (!C) + return SDValue(); + } else { + C = &(CNode->getAPIntValue()); + } + if (isPowerOf2_64(C->getZExtValue())) return SDValue(); @@ -48736,68 +48749,69 @@ uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt; SDLoc DL(N); - if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) { - SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), - DAG.getConstant(AbsMulAmt, DL, VT)); - if (SignMulAmt < 0) - NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - NewMul); - - return NewMul; - } - - uint64_t MulAmt1 = 0; - uint64_t MulAmt2 = 0; - if ((AbsMulAmt % 9) == 0) { - MulAmt1 = 9; - MulAmt2 = AbsMulAmt / 9; - } else if ((AbsMulAmt % 5) == 0) { - MulAmt1 = 5; - MulAmt2 = AbsMulAmt / 5; - } else if ((AbsMulAmt % 3) == 0) { - MulAmt1 = 3; - MulAmt2 = AbsMulAmt / 3; - } - - SDValue NewMul; - // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
- if (MulAmt2 && - (isPowerOf2_64(MulAmt2) || - (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) { - - if (isPowerOf2_64(MulAmt2) && - !(SignMulAmt >= 0 && N->hasOneUse() && - N->use_begin()->getOpcode() == ISD::ADD)) - // If second multiplifer is pow2, issue it first. We want the multiply by - // 3, 5, or 9 to be folded into the addressing mode unless the lone use - // is an add. Only do this for positive multiply amounts since the - // negate would prevent it from being used as an address mode anyway. - std::swap(MulAmt1, MulAmt2); - - if (isPowerOf2_64(MulAmt1)) - NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); - else + SDValue NewMul = SDValue(); + if (VT == MVT::i64 || VT == MVT::i32) { + if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) { NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), - DAG.getConstant(MulAmt1, DL, VT)); - - if (isPowerOf2_64(MulAmt2)) - NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, - DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); - else - NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, - DAG.getConstant(MulAmt2, DL, VT)); + DAG.getConstant(AbsMulAmt, DL, VT)); + if (SignMulAmt < 0) + NewMul = + DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); + + return NewMul; + } + + uint64_t MulAmt1 = 0; + uint64_t MulAmt2 = 0; + if ((AbsMulAmt % 9) == 0) { + MulAmt1 = 9; + MulAmt2 = AbsMulAmt / 9; + } else if ((AbsMulAmt % 5) == 0) { + MulAmt1 = 5; + MulAmt2 = AbsMulAmt / 5; + } else if ((AbsMulAmt % 3) == 0) { + MulAmt1 = 3; + MulAmt2 = AbsMulAmt / 3; + } + + // For negative multiply amounts, only allow MulAmt2 to be a power of 2. + if (MulAmt2 && + (isPowerOf2_64(MulAmt2) || + (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) { + + if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() && + N->use_begin()->getOpcode() == ISD::ADD)) + // If second multiplifer is pow2, issue it first. We want the multiply + // by 3, 5, or 9 to be folded into the addressing mode unless the lone + // use is an add. Only do this for positive multiply amounts since the + // negate would prevent it from being used as an address mode anyway. + std::swap(MulAmt1, MulAmt2); + + if (isPowerOf2_64(MulAmt1)) + NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); + else + NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), + DAG.getConstant(MulAmt1, DL, VT)); - // Negate the result. - if (SignMulAmt < 0) - NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - NewMul); - } else if (!Subtarget.slowLEA()) - NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL); + if (isPowerOf2_64(MulAmt2)) + NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, + DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); + else + NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, + DAG.getConstant(MulAmt2, DL, VT)); + // Negate the result. + if (SignMulAmt < 0) + NewMul = + DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); + } else if (!Subtarget.slowLEA()) + NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL); + } if (!NewMul) { + EVT ShiftVT = VT.isVector() ? VT : MVT::i8; assert(C->getZExtValue() != 0 && - C->getZExtValue() != (VT == MVT::i64 ? 
UINT64_MAX : UINT32_MAX) && + C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) && "Both cases that could cause potential overflows should have " "already been handled."); if (isPowerOf2_64(AbsMulAmt - 1)) { @@ -48805,38 +48819,61 @@ NewMul = DAG.getNode( ISD::ADD, DL, VT, N->getOperand(0), DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, - MVT::i8))); + DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT))); // To negate, subtract the number from zero if (SignMulAmt < 0) - NewMul = DAG.getNode(ISD::SUB, DL, VT, - DAG.getConstant(0, DL, VT), NewMul); + NewMul = + DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); } else if (isPowerOf2_64(AbsMulAmt + 1)) { // (mul x, 2^N - 1) => (sub (shl x, N), x) - NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(Log2_64(AbsMulAmt + 1), - DL, MVT::i8)); + NewMul = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT)); // To negate, reverse the operands of the subtract. if (SignMulAmt < 0) NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul); else NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0)); - } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) { + } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) && + (!VT.isVector() || Subtarget.fastImmVectorShift())) { // (mul x, 2^N + 2) => (add (shl x, N), (add x, x)) - NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(Log2_64(AbsMulAmt - 2), - DL, MVT::i8)); + NewMul = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT)); NewMul = DAG.getNode( ISD::ADD, DL, VT, NewMul, DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0))); - } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) { + } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) && + (!VT.isVector() || Subtarget.fastImmVectorShift())) { // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x)) - NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(Log2_64(AbsMulAmt + 2), - DL, MVT::i8)); + NewMul = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT)); NewMul = DAG.getNode( ISD::SUB, DL, VT, NewMul, DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0))); + } else if (SignMulAmt >= 0 && VT.isVector() && + Subtarget.fastImmVectorShift()) { + uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt); + uint64_t ShiftAmt1; + std::optional<unsigned> Opc; + if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) { + ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit; + Opc = ISD::ADD; + } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) { + ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit; + Opc = ISD::SUB; + } + + if (Opc) { + SDValue Shift1 = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT)); + SDValue Shift2 = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT)); + NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2); + } } } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -93,6 +93,7 @@ X86::TuningNoDomainDelayShuffle, X86::TuningNoDomainDelayBlend, X86::TuningPreferShiftShuffle, + X86::TuningFastImmVectorShift, // Perf-tuning flags.
X86::TuningFastGather, diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -210,12 +210,13 @@ ; CHECK-LABEL: bcast_unfold_mul_v16i32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB6_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0 +; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB6_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -244,12 +245,13 @@ ; CHECK-LABEL: bcast_unfold_mul_v8i32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB7_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %ymm0, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0 +; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm1 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB7_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -278,12 +280,13 @@ ; CHECK-LABEL: bcast_unfold_mul_v4i32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,3,3,3] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB8_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %xmm0, %xmm1 -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB8_1 ; CHECK-NEXT: # %bb.2: # %bb10 diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -234,13 +234,16 @@ ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: psubd %xmm1, %xmm3 ; SSE-NEXT: psubd %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3] -; SSE-NEXT: movdqu %xmm2, (%rsi) -; SSE-NEXT: pmulld %xmm0, %xmm2 -; SSE-NEXT: pmulld %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm3, %xmm1 ; SSE-NEXT: movdqu %xmm3, 16(%rsi) -; SSE-NEXT: movdqu %xmm0, 16(%rdi) -; SSE-NEXT: movdqu %xmm2, (%rdi) +; SSE-NEXT: movdqu %xmm2, (%rsi) +; SSE-NEXT: movdqu %xmm1, 16(%rdi) +; SSE-NEXT: movdqu %xmm0, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: PR52039: diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -103,14 +103,16 @@ define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind { ; X86-LABEL: vrolw_extract_mul_with_mask: ; X86: # %bb.0: -; X86-NEXT: vpmulld 
{{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 +; X86-NEXT: vpslld $3, %xmm0, %xmm1 +; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vprold $7, %xmm0, %xmm0 ; X86-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vrolw_extract_mul_with_mask: ; X64: # %bb.0: -; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-NEXT: vpslld $3, %xmm0, %xmm1 +; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vprold $7, %xmm0, %xmm0 ; X64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-NEXT: retq @@ -194,7 +196,8 @@ ; X86-LABEL: no_extract_mul: ; X86: # %bb.0: ; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1 -; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 +; X86-NEXT: vpslld $3, %ymm0, %ymm2 +; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X86-NEXT: vpsrld $23, %ymm0, %ymm0 ; X86-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl @@ -202,7 +205,8 @@ ; X64-LABEL: no_extract_mul: ; X64: # %bb.0: ; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 -; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; X64-NEXT: vpslld $3, %ymm0, %ymm2 +; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X64-NEXT: vpsrld $23, %ymm0, %ymm0 ; X64-NEXT: vpor %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -487,8 +487,10 @@ ; SSE41-NEXT: psrld $31, %xmm1 ; SSE41-NEXT: psrad $2, %xmm2 ; SSE41-NEXT: paddd %xmm1, %xmm2 -; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: psubd %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: pslld $3, %xmm1 +; SSE41-NEXT: psubd %xmm1, %xmm2 +; SSE41-NEXT: paddd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_rem7_4i32: @@ -503,8 +505,9 @@ ; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $3, %xmm1, %xmm2 +; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_rem7_4i32: @@ -519,9 +522,9 @@ ; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 ; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] -; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpslld $3, %xmm1, %xmm2 +; AVX2-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %res = srem <4 x i32> %a, ret <4 x i32> %res @@ -536,8 +539,10 @@ ; SSE-NEXT: psrlw $15, %xmm2 ; SSE-NEXT: psraw $1, %xmm1 ; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psllw $3, %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm1 +; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_rem7_8i16: @@ -546,8 +551,9 @@ ; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 ; AVX-NEXT: vpsraw $1, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $3, %xmm1, %xmm2 +; AVX-NEXT: vpsubw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %res = srem <8 x i16> %a, 
ret <8 x i16> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -497,9 +497,9 @@ ; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2 ; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] -; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpslld $3, %ymm1, %ymm2 +; AVX2-NEXT: vpsubd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %res = srem <8 x i32> %a, ret <8 x i32> %res @@ -533,8 +533,9 @@ ; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2 ; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $3, %ymm1, %ymm2 +; AVX2-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %res = srem <16 x i16> %a, ret <16 x i16> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -400,8 +400,9 @@ ; AVX-NEXT: vpsrld $31, %zmm1, %zmm2 ; AVX-NEXT: vpsrad $2, %zmm1, %zmm1 ; AVX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 -; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX-NEXT: vpslld $3, %zmm1, %zmm2 +; AVX-NEXT: vpsubd %zmm2, %zmm1, %zmm1 +; AVX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX-NEXT: retq %res = srem <16 x i32> %a, ret <16 x i32> %res @@ -435,8 +436,9 @@ ; AVX512BW-NEXT: vpsrlw $15, %zmm1, %zmm2 ; AVX512BW-NEXT: vpsraw $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %res = srem <32 x i16> %a, ret <32 x i16> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -558,8 +558,10 @@ ; SSE41-NEXT: psrld $1, %xmm1 ; SSE41-NEXT: paddd %xmm2, %xmm1 ; SSE41-NEXT: psrld $2, %xmm1 -; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: psubd %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pslld $3, %xmm2 +; SSE41-NEXT: psubd %xmm2, %xmm1 +; SSE41-NEXT: paddd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_rem7_4i32: @@ -574,8 +576,9 @@ ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $3, %xmm1, %xmm2 +; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_rem7_4i32: @@ -590,9 +593,9 @@ ; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] -; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpslld $3, %xmm1, %xmm2 +; 
AVX2-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %res = urem <4 x i32> %a, ret <4 x i32> %res @@ -608,8 +611,10 @@ ; SSE-NEXT: psrlw $1, %xmm2 ; SSE-NEXT: paddw %xmm1, %xmm2 ; SSE-NEXT: psrlw $2, %xmm2 -; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: psubw %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psllw $3, %xmm1 +; SSE-NEXT: psubw %xmm1, %xmm2 +; SSE-NEXT: paddw %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_rem7_8i16: @@ -619,8 +624,9 @@ ; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $3, %xmm1, %xmm2 +; AVX-NEXT: vpsubw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %res = urem <8 x i16> %a, ret <8 x i16> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -528,9 +528,9 @@ ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsrld $2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] -; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpslld $3, %ymm1, %ymm2 +; AVX2-NEXT: vpsubd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %res = urem <8 x i32> %a, ret <8 x i32> %res @@ -567,8 +567,9 @@ ; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $3, %ymm1, %ymm2 +; AVX2-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %res = urem <16 x i16> %a, ret <16 x i16> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -420,8 +420,9 @@ ; AVX-NEXT: vpsrld $1, %zmm1, %zmm1 ; AVX-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX-NEXT: vpsrld $2, %zmm1, %zmm1 -; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX-NEXT: vpslld $3, %zmm1, %zmm2 +; AVX-NEXT: vpsubd %zmm2, %zmm1, %zmm1 +; AVX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX-NEXT: retq %res = urem <16 x i32> %a, ret <16 x i32> %res @@ -458,8 +459,9 @@ ; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %res = urem <32 x i16> %a, ret <32 x i16> %res diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -322,33 +322,17 @@ } define <4 x i32> @mul_v4i32_17(<4 x i32> %a0) nounwind { -; SSE2-LABEL: mul_v4i32_17: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pslld $4, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: ret{{[l|q]}} -; 
-; X86-SSE4-LABEL: mul_v4i32_17: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE4-NEXT: retl -; -; X64-SSE4-FAST-LABEL: mul_v4i32_17: -; X64-SSE4-FAST: # %bb.0: -; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE4-FAST-NEXT: retq -; -; X64-SSE4-SLOW-LABEL: mul_v4i32_17: -; X64-SSE4-SLOW: # %bb.0: -; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE4-SLOW-NEXT: pslld $4, %xmm1 -; X64-SSE4-SLOW-NEXT: paddd %xmm1, %xmm0 -; X64-SSE4-SLOW-NEXT: retq +; SSE-LABEL: mul_v4i32_17: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pslld $4, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; ; X64-XOP-LABEL: mul_v4i32_17: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-XOP-NEXT: vpslld $4, %xmm0, %xmm1 +; X64-XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; X64-XOP-NEXT: retq ; ; X64-AVX2-LABEL: mul_v4i32_17: @@ -366,19 +350,17 @@ } define <8 x i16> @mul_v8i16_17(<8 x i16> %a0) nounwind { -; X86-SSE-LABEL: mul_v8i16_17: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: retl -; -; X64-SSE-LABEL: mul_v8i16_17: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: retq +; SSE-LABEL: mul_v8i16_17: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psllw $4, %xmm1 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; ; X64-AVX-LABEL: mul_v8i16_17: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpsllw $4, %xmm0, %xmm1 +; X64-AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: retq %1 = mul <8 x i16> %a0, ret <8 x i16> %1 @@ -461,39 +443,15 @@ } define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind { -; SSE2-LABEL: mul_v8i32_17: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pslld $4, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pslld $4, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: ret{{[l|q]}} -; -; X86-SSE4-LABEL: mul_v8i32_17: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17] -; X86-SSE4-NEXT: pmulld %xmm2, %xmm0 -; X86-SSE4-NEXT: pmulld %xmm2, %xmm1 -; X86-SSE4-NEXT: retl -; -; X64-SSE4-FAST-LABEL: mul_v8i32_17: -; X64-SSE4-FAST: # %bb.0: -; X64-SSE4-FAST-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17] -; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm0 -; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm1 -; X64-SSE4-FAST-NEXT: retq -; -; X64-SSE4-SLOW-LABEL: mul_v8i32_17: -; X64-SSE4-SLOW: # %bb.0: -; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE4-SLOW-NEXT: pslld $4, %xmm2 -; X64-SSE4-SLOW-NEXT: paddd %xmm2, %xmm0 -; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE4-SLOW-NEXT: pslld $4, %xmm2 -; X64-SSE4-SLOW-NEXT: paddd %xmm2, %xmm1 -; X64-SSE4-SLOW-NEXT: retq +; SSE-LABEL: mul_v8i32_17: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pslld $4, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pslld $4, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: ret{{[l|q]}} ; ; X64-XOP-LABEL: mul_v8i32_17: ; X64-XOP: # %bb.0: @@ -522,9 +480,12 @@ define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind { ; SSE-LABEL: mul_v16i16_17: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17] -; SSE-NEXT: pmullw %xmm2, %xmm0 -; SSE-NEXT: pmullw %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psllw $4, %xmm2 +; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: 
movdqa %xmm1, %xmm2 +; SSE-NEXT: psllw $4, %xmm2 +; SSE-NEXT: paddw %xmm2, %xmm1 ; SSE-NEXT: ret{{[l|q]}} ; ; X64-XOP-LABEL: mul_v16i16_17: @@ -539,12 +500,14 @@ ; ; X64-AVX2-LABEL: mul_v16i16_17: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 +; X64-AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v16i16_17: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1 +; X64-AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i16> %a0, ret <16 x i16> %1 @@ -618,37 +581,21 @@ } define <4 x i32> @mul_v4i32_neg33(<4 x i32> %a0) nounwind { -; SSE2-LABEL: mul_v4i32_neg33: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pslld $5, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: ret{{[l|q]}} -; -; X86-SSE4-LABEL: mul_v4i32_neg33: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE4-NEXT: retl -; -; X64-SSE4-FAST-LABEL: mul_v4i32_neg33: -; X64-SSE4-FAST: # %bb.0: -; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE4-FAST-NEXT: retq -; -; X64-SSE4-SLOW-LABEL: mul_v4i32_neg33: -; X64-SSE4-SLOW: # %bb.0: -; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE4-SLOW-NEXT: pslld $5, %xmm1 -; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm1 -; X64-SSE4-SLOW-NEXT: pxor %xmm0, %xmm0 -; X64-SSE4-SLOW-NEXT: psubd %xmm1, %xmm0 -; X64-SSE4-SLOW-NEXT: retq +; SSE-LABEL: mul_v4i32_neg33: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pslld $5, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; ; X64-XOP-LABEL: mul_v4i32_neg33: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-XOP-NEXT: vpslld $5, %xmm0, %xmm1 +; X64-XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; X64-XOP-NEXT: retq ; ; X64-AVX2-LABEL: mul_v4i32_neg33: @@ -666,19 +613,21 @@ } define <8 x i16> @mul_v8i16_neg9(<8 x i16> %a0) nounwind { -; X86-SSE-LABEL: mul_v8i16_neg9: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: retl -; -; X64-SSE-LABEL: mul_v8i16_neg9: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: retq +; SSE-LABEL: mul_v8i16_neg9: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psllw $3, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; ; X64-AVX-LABEL: mul_v8i16_neg9: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm1 +; X64-AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: retq %1 = mul <8 x i16> %a0, ret <8 x i16> %1 @@ -783,49 +732,20 @@ } define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind { -; SSE2-LABEL: mul_v8i32_neg33: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pslld $5, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psubd %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pslld 
$5, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: psubd %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: ret{{[l|q]}} -; -; X86-SSE4-LABEL: mul_v8i32_neg33: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263] -; X86-SSE4-NEXT: pmulld %xmm2, %xmm0 -; X86-SSE4-NEXT: pmulld %xmm2, %xmm1 -; X86-SSE4-NEXT: retl -; -; X64-SSE4-FAST-LABEL: mul_v8i32_neg33: -; X64-SSE4-FAST: # %bb.0: -; X64-SSE4-FAST-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263] -; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm0 -; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm1 -; X64-SSE4-FAST-NEXT: retq -; -; X64-SSE4-SLOW-LABEL: mul_v8i32_neg33: -; X64-SSE4-SLOW: # %bb.0: -; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE4-SLOW-NEXT: pslld $5, %xmm3 -; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm3 -; X64-SSE4-SLOW-NEXT: pxor %xmm2, %xmm2 -; X64-SSE4-SLOW-NEXT: pxor %xmm0, %xmm0 -; X64-SSE4-SLOW-NEXT: psubd %xmm3, %xmm0 -; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm3 -; X64-SSE4-SLOW-NEXT: pslld $5, %xmm3 -; X64-SSE4-SLOW-NEXT: paddd %xmm1, %xmm3 -; X64-SSE4-SLOW-NEXT: psubd %xmm3, %xmm2 -; X64-SSE4-SLOW-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE4-SLOW-NEXT: retq +; SSE-LABEL: mul_v8i32_neg33: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pslld $5, %xmm3 +; SSE-NEXT: paddd %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: psubd %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pslld $5, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: psubd %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: ret{{[l|q]}} ; ; X64-XOP-LABEL: mul_v8i32_neg33: ; X64-XOP: # %bb.0: @@ -857,9 +777,17 @@ define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind { ; SSE-LABEL: mul_v16i16_neg9: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527] -; SSE-NEXT: pmullw %xmm2, %xmm0 -; SSE-NEXT: pmullw %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psllw $3, %xmm3 +; SSE-NEXT: paddw %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: psubw %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psllw $3, %xmm3 +; SSE-NEXT: paddw %xmm1, %xmm3 +; SSE-NEXT: psubw %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: ret{{[l|q]}} ; ; X64-XOP-LABEL: mul_v16i16_neg9: @@ -877,12 +805,18 @@ ; ; X64-AVX2-LABEL: mul_v16i16_neg9: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsllw $3, %ymm0, %ymm1 +; X64-AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v16i16_neg9: ; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm1 +; X64-AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX512DQ-NEXT: vpsubw %ymm0, %ymm1, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = mul <16 x i16> %a0, ret <16 x i16> %1 @@ -1162,35 +1096,18 @@ } define <4 x i32> @mul_v4i32_7(<4 x i32> %a0) nounwind { -; SSE2-LABEL: mul_v4i32_7: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pslld $3, %xmm1 -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: ret{{[l|q]}} -; -; X86-SSE4-LABEL: mul_v4i32_7: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE4-NEXT: 
retl -; -; X64-SSE4-FAST-LABEL: mul_v4i32_7: -; X64-SSE4-FAST: # %bb.0: -; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE4-FAST-NEXT: retq -; -; X64-SSE4-SLOW-LABEL: mul_v4i32_7: -; X64-SSE4-SLOW: # %bb.0: -; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE4-SLOW-NEXT: pslld $3, %xmm1 -; X64-SSE4-SLOW-NEXT: psubd %xmm0, %xmm1 -; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE4-SLOW-NEXT: retq +; SSE-LABEL: mul_v4i32_7: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pslld $3, %xmm1 +; SSE-NEXT: psubd %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; ; X64-XOP-LABEL: mul_v4i32_7: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-XOP-NEXT: vpslld $3, %xmm0, %xmm1 +; X64-XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; X64-XOP-NEXT: retq ; ; X64-AVX2-LABEL: mul_v4i32_7: @@ -1208,19 +1125,18 @@ } define <8 x i16> @mul_v8i16_7(<8 x i16> %a0) nounwind { -; X86-SSE-LABEL: mul_v8i16_7: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: retl -; -; X64-SSE-LABEL: mul_v8i16_7: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: retq +; SSE-LABEL: mul_v8i16_7: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psllw $3, %xmm1 +; SSE-NEXT: psubw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; ; X64-AVX-LABEL: mul_v8i16_7: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm1 +; X64-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: retq %1 = mul <8 x i16> %a0, ret <8 x i16> %1 @@ -1290,33 +1206,17 @@ } define <4 x i32> @mul_v4i32_neg63(<4 x i32> %a0) nounwind { -; SSE2-LABEL: mul_v4i32_neg63: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pslld $6, %xmm1 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: ret{{[l|q]}} -; -; X86-SSE4-LABEL: mul_v4i32_neg63: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE4-NEXT: retl -; -; X64-SSE4-FAST-LABEL: mul_v4i32_neg63: -; X64-SSE4-FAST: # %bb.0: -; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE4-FAST-NEXT: retq -; -; X64-SSE4-SLOW-LABEL: mul_v4i32_neg63: -; X64-SSE4-SLOW: # %bb.0: -; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE4-SLOW-NEXT: pslld $6, %xmm1 -; X64-SSE4-SLOW-NEXT: psubd %xmm1, %xmm0 -; X64-SSE4-SLOW-NEXT: retq +; SSE-LABEL: mul_v4i32_neg63: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pslld $6, %xmm1 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; ; X64-XOP-LABEL: mul_v4i32_neg63: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-XOP-NEXT: vpslld $6, %xmm0, %xmm1 +; X64-XOP-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; X64-XOP-NEXT: retq ; ; X64-AVX2-LABEL: mul_v4i32_neg63: @@ -1334,19 +1234,17 @@ } define <8 x i16> @mul_v8i16_neg31(<8 x i16> %a0) nounwind { -; X86-SSE-LABEL: mul_v8i16_neg31: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: retl -; -; X64-SSE-LABEL: mul_v8i16_neg31: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: retq +; SSE-LABEL: mul_v8i16_neg31: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psllw $5, %xmm1 +; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} ; ; X64-AVX-LABEL: mul_v8i16_neg31: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpsllw $5, %xmm0, %xmm1 +; X64-AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: retq %1 = mul <8 x i16> %a0, ret <8 x i16> %1 @@ -2090,3 +1988,6 @@ %e = mul <4 x i64> %b, %d ret <4 x i64> %e } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; X64-SSE4-FAST: {{.*}} +; X64-SSE4-SLOW: {{.*}}
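For reference, a minimal standalone sketch of the arithmetic that the new vector path in combineMul checks for. This is not LLVM code; the names decomposeToTwoShifts, TwoShiftDecomp, isPow2 and log2u are invented for illustration. A positive multiplier C is rewritten as two immediate shifts plus one add/sub whenever C = 2^a + 2^b or C = 2^a - 2^b, where 2^b is C's lowest set bit; on tunings with tuning-fast-imm-vector-shift (2 shifts/cycle per the flag description) this is expected to be cheaper than vpmulld/vpmullw, which is why the transform is gated on the flag. The sketch deliberately omits the scalar-only 3/5/9 LEA patterns and the negative-multiplier handling shown in the patch.

// Illustrative only -- mirrors the AbsMulAmtLowBit logic above, not the actual
// LLVM helpers. Build with -std=c++17 or later. Assumes AbsMulAmt != 0.
#include <cassert>
#include <cstdint>
#include <optional>

struct TwoShiftDecomp {
  unsigned Shift1; // log2 of the larger power of two
  unsigned Shift2; // log2 of the multiplier's lowest set bit
  bool IsAdd;      // true: (x << Shift1) + (x << Shift2); false: (x << Shift1) - (x << Shift2)
};

static bool isPow2(uint64_t V) { return V != 0 && (V & (V - 1)) == 0; }
static unsigned log2u(uint64_t V) { unsigned L = 0; while (V >>= 1) ++L; return L; }

static std::optional<TwoShiftDecomp> decomposeToTwoShifts(uint64_t AbsMulAmt) {
  uint64_t LowBit = AbsMulAmt & (0 - AbsMulAmt); // lowest set bit of the multiplier
  if (isPow2(AbsMulAmt - LowBit))                // C == 2^a + 2^b
    return TwoShiftDecomp{log2u(AbsMulAmt - LowBit), log2u(LowBit), true};
  if (isPow2(AbsMulAmt + LowBit))                // C == 2^a - 2^b
    return TwoShiftDecomp{log2u(AbsMulAmt + LowBit), log2u(LowBit), false};
  return std::nullopt;                           // e.g. 22 = 16 + 4 + 2: keep the multiply
}

int main() {
  auto A = decomposeToTwoShifts(24);  // 24 = (1 << 4) + (1 << 3)
  assert(A && A->IsAdd && A->Shift1 == 4 && A->Shift2 == 3);
  auto B = decomposeToTwoShifts(112); // 112 = (1 << 7) - (1 << 4)
  assert(B && !B->IsAdd && B->Shift1 == 7 && B->Shift2 == 4);
  assert(!decomposeToTwoShifts(22));  // needs three terms, so no decomposition
  return 0;
}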