Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -532,6 +532,12 @@
     return false;
   }
 
+  // Return true if the target can efficiently lower a given vector shift with
+  // a non-splat vector shift amount.
+  virtual bool isSupportedVectorVarShift(EVT VT, unsigned Opcode) const {
+    return false;
+  }
+
   /// Return the ValueType of the result of SETCC operations.
   virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                                  EVT VT) const;
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2899,18 +2899,75 @@
   if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
     return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);
 
+  // Helper for determining whether a value is a power-of-2 constant scalar or
+  // a vector of such elements.
+  auto IsPowerOfTwo = [](SDValue V, SmallBitVector &KnownNegatives) {
+    // Scalar and splat-vector cases.
+    auto *C = isConstOrConstSplat(V);
+    if (C && !C->isNullValue() && !C->isOpaque()) {
+      KnownNegatives.resize(1, false);
+      if (C->getAPIntValue().isPowerOf2()) {
+        return true;
+      } else if ((-C->getAPIntValue()).isPowerOf2()) {
+        KnownNegatives.set();
+        return true;
+      }
+    }
+    // Non-splat vector case.
+    if (V.getOpcode() != ISD::BUILD_VECTOR)
+      return false;
+    KnownNegatives.resize(V.getNumOperands(), false);
+    for (int i = 0, e = V.getNumOperands(); i != e; ++i) {
+      auto *COp = dyn_cast<ConstantSDNode>(V.getOperand(i));
+      if (!COp || COp->isOpaque())
+        return false;
+      if (COp->getAPIntValue().isPowerOf2())
+        continue;
+      else if ((-(COp->getAPIntValue())).isPowerOf2())
+        KnownNegatives.set(i);
+      else
+        return false;
+    }
+    return true;
+  };
   // fold (sdiv X, pow2) -> simple ops after legalize
   // FIXME: We check for the exact bit here because the generic lowering gives
   // better results in that case. The target-specific lowering should learn how
   // to handle exact sdivs efficiently.
-  if (N1C && !N1C->isNullValue() && !N1C->isOpaque() &&
-      !N->getFlags().hasExact() && (N1C->getAPIntValue().isPowerOf2() ||
-                                    (-N1C->getAPIntValue()).isPowerOf2())) {
+  // For non-splat vector shift amounts, this is profitable only if the target
+  // can lower vector shifts with vector variable shift amounts efficiently.
+  SmallBitVector KnownNegatives;
+  if (!N->getFlags().hasExact() &&
+      (N1C || (TLI.isSupportedVectorVarShift(VT, ISD::SRA) &&
+               TLI.isSupportedVectorVarShift(VT, ISD::SRL))) &&
+      IsPowerOfTwo(N1, KnownNegatives)) {
     // Target-specific implementation of sdiv x, pow2.
     if (SDValue Res = BuildSDIVPow2(N))
      return Res;
 
-    unsigned lg2 = N1C->getAPIntValue().countTrailingZeros();
+    // Create constants that are functions of the shift amount value.
+    SDValue C0, C1;
+    if (N1C) {
+      // Scalar or splat-vector shift amount case.
+      unsigned lg2 = N1C->getAPIntValue().countTrailingZeros();
+      C0 = DAG.getConstant(VT.getScalarSizeInBits() - lg2, DL,
+                           getShiftAmountTy(N0.getValueType()));
+      C1 = DAG.getConstant(lg2, DL, getShiftAmountTy(N0.getValueType()));
+    } else {
+      // Non-splat vector shift amount case.
+      SmallVector<SDValue, 16> Ops0, Ops1;
+      for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+        unsigned lg2 = cast<ConstantSDNode>(N1->getOperand(i))
+                           ->getAPIntValue()
+                           .countTrailingZeros();
+        Ops0.push_back(DAG.getConstant(VT.getScalarSizeInBits() - lg2, DL,
+                                       VT.getScalarType()));
+        Ops1.push_back(DAG.getConstant(lg2, DL, VT.getScalarType()));
+      }
+
+      C0 = DAG.getBuildVector(VT, DL, Ops0);
+      C1 = DAG.getBuildVector(VT, DL, Ops1);
+    }
 
     // Splat the sign bit into the register
     SDValue SGN =
@@ -2920,24 +2977,38 @@
     AddToWorklist(SGN.getNode());
 
     // Add (N0 < 0) ? abs2 - 1 : 0;
-    SDValue SRL =
-        DAG.getNode(ISD::SRL, DL, VT, SGN,
-                    DAG.getConstant(VT.getScalarSizeInBits() - lg2, DL,
-                                    getShiftAmountTy(SGN.getValueType())));
+    SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, SGN, C0);
     SDValue ADD = DAG.getNode(ISD::ADD, DL, VT, N0, SRL);
     AddToWorklist(SRL.getNode());
-    AddToWorklist(ADD.getNode()); // Divide by pow2
-    SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, ADD,
-                              DAG.getConstant(lg2, DL,
-                                              getShiftAmountTy(ADD.getValueType())));
+    AddToWorklist(ADD.getNode()); // Divide by pow2
+    SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, ADD, C1);
 
     // If we're dividing by a positive value, we're done. Otherwise, we must
    // negate the result.
-    if (N1C->getAPIntValue().isNonNegative())
+    if (KnownNegatives.none())
      return SRA;
 
    AddToWorklist(SRA.getNode());
-    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
+    SDValue SUB =
+        DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
+    // If all shift amount elements are negative, we're done.
+    if (KnownNegatives.all())
+      return SUB;
+
+    // Shift amount has both positive and negative elements.
+    assert(VT.isVector() && !N1C &&
+           "Expecting a non-splat vector shift amount");
+
+    SmallVector<SDValue, 16> VSELECTMask;
+    for (int i = 0, e = VT.getVectorNumElements(); i < e; ++i)
+      VSELECTMask.push_back(
+          DAG.getConstant(KnownNegatives[i] ? -1 : 0, DL, MVT::i1));
+
+    SDValue MASK =
+        DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                            VT.getVectorElementCount()),
+                           DL, VSELECTMask);
+    return DAG.getNode(ISD::VSELECT, DL, VT, MASK, SUB, SRA);
   }
 
   // If integer divide is expensive and we satisfy the requirements, emit an
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -835,6 +835,8 @@
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
 
+    bool isSupportedVectorVarShift(EVT VT, unsigned Opcode) const override;
+
     bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
                                       TargetLoweringOpt &TLO) const override;
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -22734,8 +22734,9 @@
 
 // Return true if the required (according to Opcode) variable-shift form is
 // natively supported by the Subtarget
-static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
-                                    unsigned Opcode) {
+bool X86TargetLowering::isSupportedVectorVarShift(EVT VT, unsigned Opcode) const {
+  if (!VT.isSimple())
+    return false;
   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
     return false;
 
@@ -23040,7 +23041,8 @@
   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
     return V;
 
-  if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
+  if (DAG.getTargetLoweringInfo().isSupportedVectorVarShift(Op.getValueType(),
+                                                            Op.getOpcode()))
     return Op;
 
   // XOP has 128-bit variable logical/arithmetic shifts.
Index: test/CodeGen/X86/combine-sdiv.ll
===================================================================
--- test/CodeGen/X86/combine-sdiv.ll
+++ test/CodeGen/X86/combine-sdiv.ll
@@ -703,60 +703,10 @@
 ;
 ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; AVX512BW-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    sarw $15, %cx
-; AVX512BW-NEXT:    movzwl %cx, %ecx
-; AVX512BW-NEXT:    shrl $14, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    sarw $2, %cx
-; AVX512BW-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    shrl $15, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    sarw %cx
-; AVX512BW-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    sarw $15, %cx
-; AVX512BW-NEXT:    movzwl %cx, %ecx
-; AVX512BW-NEXT:    shrl $12, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    sarw $4, %cx
-; AVX512BW-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrw $4, %xmm0, %eax
-; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    sarw $15, %cx
-; AVX512BW-NEXT:    movzwl %cx, %ecx
-; AVX512BW-NEXT:    shrl $13, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    sarw $3, %cx
-; AVX512BW-NEXT:    vpinsrw $4, %ecx, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrw $5, %xmm0, %eax
-; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    sarw $15, %cx
-; AVX512BW-NEXT:    movzwl %cx, %ecx
-; AVX512BW-NEXT:    shrl $11, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    sarw $5, %cx
-; AVX512BW-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $10, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $6, %cx -; AVX512BW-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrw $7, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm0 +; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm1 +; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: retq %1 = sdiv <8 x i16> %x, ret <8 x i16> %1 @@ -1218,115 +1168,10 @@ ; ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovss {{.*#+}} xmm3 = xmm1[0],xmm2[1,2,3] -; AVX512BW-NEXT: vpextrw $1, %xmm1, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $14, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $2, %cx -; AVX512BW-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $2, %xmm1, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $3, %xmm1, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $12, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $4, %cx -; AVX512BW-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $4, %xmm1, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $13, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $3, %cx -; AVX512BW-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $5, %xmm1, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $11, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $5, %cx -; AVX512BW-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $6, %xmm1, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $10, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $6, %cx -; AVX512BW-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $7, %xmm1, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm1 -; AVX512BW-NEXT: vmovss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; AVX512BW-NEXT: vpextrw $1, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $14, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $2, %cx -; AVX512BW-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrw $2, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrw $3, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; 
AVX512BW-NEXT: shrl $12, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $4, %cx -; AVX512BW-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $13, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $3, %cx -; AVX512BW-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrw $5, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $11, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $5, %cx -; AVX512BW-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrw $6, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $10, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $6, %cx -; AVX512BW-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; AVX512BW-NEXT: vpextrw $7, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm1 +; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: retq %1 = sdiv <16 x i16> %x, ret <16 x i16> %1 @@ -2222,225 +2067,10 @@ ; ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3] -; AVX512BW-NEXT: vpextrw $1, %xmm2, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $14, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $2, %cx -; AVX512BW-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $2, %xmm2, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $3, %xmm2, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $12, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $4, %cx -; AVX512BW-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $13, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $3, %cx -; AVX512BW-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $5, %xmm2, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $11, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $5, %cx -; AVX512BW-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $6, %xmm2, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $10, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $6, %cx -; AVX512BW-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; 
AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm2 -; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3 -; AVX512BW-NEXT: vmovss {{.*#+}} xmm4 = xmm3[0],xmm1[1,2,3] -; AVX512BW-NEXT: vpextrw $1, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $14, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $2, %cx -; AVX512BW-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $2, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $2, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $3, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $12, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $4, %cx -; AVX512BW-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $4, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $13, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $3, %cx -; AVX512BW-NEXT: vpinsrw $4, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $5, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $11, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $5, %cx -; AVX512BW-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $6, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $10, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $6, %cx -; AVX512BW-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $7, %ecx, %xmm4, %xmm3 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vmovss {{.*#+}} xmm4 = xmm3[0],xmm1[1,2,3] -; AVX512BW-NEXT: vpextrw $1, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $14, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $2, %cx -; AVX512BW-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $2, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $2, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $3, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $12, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $4, %cx -; AVX512BW-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $4, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $13, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $3, %cx -; AVX512BW-NEXT: vpinsrw $4, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $5, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: 
shrl $11, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $5, %cx -; AVX512BW-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $6, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $10, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $6, %cx -; AVX512BW-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $7, %ecx, %xmm4, %xmm3 -; AVX512BW-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; AVX512BW-NEXT: vpextrw $1, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $14, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $2, %cx -; AVX512BW-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrw $2, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrw $3, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $12, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $4, %cx -; AVX512BW-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $13, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $3, %cx -; AVX512BW-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrw $5, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $11, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $5, %cx -; AVX512BW-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrw $6, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: sarw $15, %cx -; AVX512BW-NEXT: movzwl %cx, %ecx -; AVX512BW-NEXT: shrl $10, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw $6, %cx -; AVX512BW-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrw $7, %xmm0, %eax -; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $15, %ecx -; AVX512BW-NEXT: addl %eax, %ecx -; AVX512BW-NEXT: sarw %cx -; AVX512BW-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %1 = sdiv <32 x i16> %x, ret <32 x i16> %1 @@ -2472,30 +2102,38 @@ ; SSE-NEXT: pinsrd $3, %eax, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_sdiv_by_pow2b_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpextrd $1, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: sarl $31, %ecx -; AVX-NEXT: shrl $30, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm1 -; AVX-NEXT: vpextrd $2, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: sarl $31, %ecx -; AVX-NEXT: shrl $29, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: sarl $3, %ecx -; AVX-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; AVX-NEXT: 
vpextrd $3, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: sarl $31, %ecx -; AVX-NEXT: shrl $28, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: sarl $4, %ecx -; AVX-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: sarl $31, %ecx +; AVX1-NEXT: shrl $30, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: sarl $2, %ecx +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm1 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: sarl $31, %ecx +; AVX1-NEXT: shrl $29, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: sarl $3, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: sarl $31, %ecx +; AVX1-NEXT: shrl $28, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: sarl $4, %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32: +; AVX2ORLATER: # %bb.0: +; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2ORLATER-NEXT: retq %1 = sdiv <4 x i32> %x, ret <4 x i32> %1 } @@ -2597,50 +2235,10 @@ ; ; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32: ; AVX2ORLATER: # %bb.0: -; AVX2ORLATER-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2ORLATER-NEXT: vpextrd $1, %xmm1, %eax -; AVX2ORLATER-NEXT: movl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $31, %ecx -; AVX2ORLATER-NEXT: shrl $30, %ecx -; AVX2ORLATER-NEXT: addl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $2, %ecx -; AVX2ORLATER-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm2 -; AVX2ORLATER-NEXT: vpextrd $2, %xmm1, %eax -; AVX2ORLATER-NEXT: movl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $31, %ecx -; AVX2ORLATER-NEXT: shrl $29, %ecx -; AVX2ORLATER-NEXT: addl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $3, %ecx -; AVX2ORLATER-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; AVX2ORLATER-NEXT: vpextrd $3, %xmm1, %eax -; AVX2ORLATER-NEXT: movl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $31, %ecx -; AVX2ORLATER-NEXT: shrl $28, %ecx -; AVX2ORLATER-NEXT: addl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $4, %ecx -; AVX2ORLATER-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1 -; AVX2ORLATER-NEXT: vpextrd $1, %xmm0, %eax -; AVX2ORLATER-NEXT: movl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $31, %ecx -; AVX2ORLATER-NEXT: shrl $30, %ecx -; AVX2ORLATER-NEXT: addl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $2, %ecx -; AVX2ORLATER-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm2 -; AVX2ORLATER-NEXT: vpextrd $2, %xmm0, %eax -; AVX2ORLATER-NEXT: movl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $31, %ecx -; AVX2ORLATER-NEXT: shrl $29, %ecx -; AVX2ORLATER-NEXT: addl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $3, %ecx -; AVX2ORLATER-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; AVX2ORLATER-NEXT: vpextrd $3, %xmm0, %eax -; AVX2ORLATER-NEXT: movl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $31, %ecx -; AVX2ORLATER-NEXT: shrl $28, %ecx -; AVX2ORLATER-NEXT: addl %eax, %ecx -; AVX2ORLATER-NEXT: sarl $4, %ecx -; AVX2ORLATER-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 -; AVX2ORLATER-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2ORLATER-NEXT: vpsrad $31, %ymm0, %ymm1 +; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 +; AVX2ORLATER-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ; AVX2ORLATER-NEXT: retq %1 = sdiv <8 x i32> %x, ret <8 x i32> %1 @@ -2829,188 +2427,26 @@ ; ; AVX2-LABEL: 
combine_vec_sdiv_by_pow2b_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpextrd $1, %xmm2, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $30, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $2, %ecx -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm3 -; AVX2-NEXT: vpextrd $2, %xmm2, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $29, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $3, %ecx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrd $3, %xmm2, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $28, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $4, %ecx -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm2 -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $30, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $2, %ecx -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm3 -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $29, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $3, %ecx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $28, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $4, %ecx -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpextrd $1, %xmm2, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $30, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $2, %ecx -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm3 -; AVX2-NEXT: vpextrd $2, %xmm2, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $29, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $3, %ecx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrd $3, %xmm2, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $28, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $4, %ecx -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm2 -; AVX2-NEXT: vpextrd $1, %xmm1, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $30, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $2, %ecx -; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm3 -; AVX2-NEXT: vpextrd $2, %xmm1, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $29, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $3, %ecx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 -; AVX2-NEXT: vpextrd $3, %xmm1, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: sarl $31, %ecx -; AVX2-NEXT: shrl $28, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: sarl $4, %ecx -; AVX2-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [32,30,29,28,32,30,29,28] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,3,4,0,2,3,4] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpsravd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm1, %ymm4 +; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsravd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; 
AVX512-LABEL: combine_vec_sdiv_by_pow2b_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vpextrd $1, %xmm1, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $30, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $2, %ecx -; AVX512-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm2 -; AVX512-NEXT: vpextrd $2, %xmm1, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $29, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $3, %ecx -; AVX512-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; AVX512-NEXT: vpextrd $3, %xmm1, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $28, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $4, %ecx -; AVX512-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1 -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512-NEXT: vpextrd $1, %xmm2, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $30, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $2, %ecx -; AVX512-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm3 -; AVX512-NEXT: vpextrd $2, %xmm2, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $29, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $3, %ecx -; AVX512-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrd $3, %xmm2, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $28, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $4, %ecx -; AVX512-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-NEXT: vpextrd $1, %xmm2, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $30, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $2, %ecx -; AVX512-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm3 -; AVX512-NEXT: vpextrd $2, %xmm2, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $29, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $3, %ecx -; AVX512-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrd $3, %xmm2, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $28, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $4, %ecx -; AVX512-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm2 -; AVX512-NEXT: vpextrd $1, %xmm0, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $30, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $2, %ecx -; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm3 -; AVX512-NEXT: vpextrd $2, %xmm0, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $29, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $3, %ecx -; AVX512-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 -; AVX512-NEXT: vpextrd $3, %xmm0, %eax -; AVX512-NEXT: movl %eax, %ecx -; AVX512-NEXT: sarl $31, %ecx -; AVX512-NEXT: shrl $28, %ecx -; AVX512-NEXT: addl %eax, %ecx -; AVX512-NEXT: sarl $4, %ecx -; AVX512-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrad $31, %zmm0, %zmm1 +; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: retq %1 = sdiv <16 x i32> %x, ret <16 x i32> %1 @@ -3029,17 +2465,54 @@ ; 
SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_sdiv_by_pow2b_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: sarq $63, %rcx -; AVX-NEXT: shrq $62, %rcx -; AVX-NEXT: addq %rax, %rcx -; AVX-NEXT: sarq $2, %rcx -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: shrq $62, %rcx +; AVX1-NEXT: addq %rax, %rcx +; AVX1-NEXT: sarq $2, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: shrq $62, %rcx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: sarq $2, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0 +; AVX512F-NEXT: movl $2, %eax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2 +; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsraq $63, %xmm0, %xmm1 +; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: movl $2, %eax +; AVX512BW-NEXT: vmovq %rax, %xmm1 +; AVX512BW-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpsravq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: retq %1 = sdiv <2 x i64> %x, ret <2 x i64> %1 } @@ -3101,34 +2574,53 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i64: -; AVX2ORLATER: # %bb.0: -; AVX2ORLATER-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2ORLATER-NEXT: vpextrq $1, %xmm1, %rax -; AVX2ORLATER-NEXT: movq %rax, %rcx -; AVX2ORLATER-NEXT: sarq $63, %rcx -; AVX2ORLATER-NEXT: shrq $60, %rcx -; AVX2ORLATER-NEXT: addq %rax, %rcx -; AVX2ORLATER-NEXT: sarq $4, %rcx -; AVX2ORLATER-NEXT: vmovq %rcx, %xmm2 -; AVX2ORLATER-NEXT: vmovq %xmm1, %rax -; AVX2ORLATER-NEXT: movq %rax, %rcx -; AVX2ORLATER-NEXT: sarq $63, %rcx -; AVX2ORLATER-NEXT: shrq $61, %rcx -; AVX2ORLATER-NEXT: addq %rax, %rcx -; AVX2ORLATER-NEXT: sarq $3, %rcx -; AVX2ORLATER-NEXT: vmovq %rcx, %xmm1 -; AVX2ORLATER-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2ORLATER-NEXT: vpextrq $1, %xmm0, %rax -; AVX2ORLATER-NEXT: movq %rax, %rcx -; AVX2ORLATER-NEXT: sarq $63, %rcx -; AVX2ORLATER-NEXT: shrq $62, %rcx -; AVX2ORLATER-NEXT: addq %rax, %rcx -; AVX2ORLATER-NEXT: sarq $2, %rcx -; AVX2ORLATER-NEXT: vmovq %rcx, %xmm2 -; AVX2ORLATER-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2ORLATER-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2ORLATER-NEXT: retq +; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: 
vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: shrq $60, %rcx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: sarq $4, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: shrq $61, %rcx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: sarq $3, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: shrq $62, %rcx +; AVX2-NEXT: addq %rax, %rcx +; AVX2-NEXT: sarq $2, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,3,4] +; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2 +; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsravq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsraq $63, %ymm0, %ymm1 +; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsravq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: retq %1 = sdiv <4 x i64> %x, ret <4 x i64> %1 } @@ -3294,58 +2786,10 @@ ; ; AVX512-LABEL: combine_vec_sdiv_by_pow2b_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: sarq $63, %rcx -; AVX512-NEXT: shrq $60, %rcx -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: sarq $4, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm2 -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: sarq $63, %rcx -; AVX512-NEXT: shrq $61, %rcx -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: sarq $3, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512-NEXT: vpextrq $1, %xmm2, %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: sarq $63, %rcx -; AVX512-NEXT: shrq $62, %rcx -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: sarq $2, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-NEXT: vpextrq $1, %xmm2, %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: sarq $63, %rcx -; AVX512-NEXT: shrq $60, %rcx -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: sarq $4, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm3 -; AVX512-NEXT: vmovq %xmm2, %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: sarq $63, %rcx -; AVX512-NEXT: shrq $61, %rcx -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: sarq $3, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: sarq $63, %rcx -; AVX512-NEXT: shrq $62, %rcx -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: sarq $2, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm3 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX512-NEXT: 
vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsraq $63, %zmm0, %zmm1 +; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsravq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: retq %1 = sdiv <8 x i64> %x, ret <8 x i64> %1 @@ -3379,32 +2823,43 @@ ; SSE-NEXT: pinsrd $3, %eax, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: -; AVX: # %bb.0: -; AVX-NEXT: vpextrd $1, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: sarl $31, %ecx -; AVX-NEXT: shrl $30, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: negl %ecx -; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm1 -; AVX-NEXT: vpextrd $2, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: sarl $31, %ecx -; AVX-NEXT: shrl $29, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: sarl $3, %ecx -; AVX-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrd $3, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: sarl $31, %ecx -; AVX-NEXT: shrl $28, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: sarl $4, %ecx -; AVX-NEXT: negl %ecx -; AVX-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: +; AVX1: # %bb.0: +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: sarl $31, %ecx +; AVX1-NEXT: shrl $30, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: sarl $2, %ecx +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm1 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: sarl $31, %ecx +; AVX1-NEXT: shrl $29, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: sarl $3, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: sarl $31, %ecx +; AVX1-NEXT: shrl $28, %ecx +; AVX1-NEXT: addl %eax, %ecx +; AVX1-NEXT: sarl $4, %ecx +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: +; AVX2ORLATER: # %bb.0: +; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX2ORLATER-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 +; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2ORLATER-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1 +; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2ORLATER-NEXT: retq %1 = sdiv <4 x i32> %x, ret <4 x i32> %1 }
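
For reference, the following is a minimal standalone C++ sketch (not part of the patch; the file name, helper name, and test values are illustrative, and arithmetic right-shift semantics for signed integers are assumed) of the per-lane arithmetic the new DAG sequence relies on: SRA splats the sign bit, SRL turns it into the bias 2^k - 1, ADD applies the bias, SRA by k performs the rounded division, and a per-lane negation selected by a VSELECT handles the negative divisors. The Lg2 == 0 branch models the way the AVX2/AVX-512 variable logical shifts produce zero for out-of-range shift amounts (visible in the [32,30,29,28] shift-amount constant in the v4i32 checks), which is what makes the divide-by-one lanes come out right.

// sdiv_pow2_sketch.cpp - illustrative only; not part of this patch.
// Per-lane model of the emitted sequence: SRA (sign splat), SRL (bias),
// ADD, SRA by lg2, and a per-lane negate for negative divisors.
#include <cassert>
#include <cstdint>

static int32_t SDivByPow2Lane(int32_t X, int32_t Divisor) {
  const unsigned BW = 32;
  uint32_t AbsD = Divisor < 0 ? -(uint32_t)Divisor : (uint32_t)Divisor;
  assert(AbsD != 0 && (AbsD & (AbsD - 1)) == 0 && "divisor must be +/- 2^k");
  unsigned Lg2 = 0;
  while ((1u << Lg2) != AbsD)
    ++Lg2;

  int32_t Sign = X >> (BW - 1); // arithmetic shift assumed: 0 or -1
  // Bias = (X < 0) ? 2^Lg2 - 1 : 0.  A logical shift by BW (the Lg2 == 0
  // case) yields 0 on the AVX2/AVX-512 variable-shift instructions; model
  // that explicitly so the divide-by-one lanes stay well defined.
  uint32_t Bias = Lg2 ? (uint32_t)Sign >> (BW - Lg2) : 0;
  int32_t Sra = (int32_t)((uint32_t)X + Bias) >> Lg2;
  return Divisor < 0 ? -Sra : Sra; // the per-lane VSELECT in the DAG
}

int main() {
  // Mixed positive and negative power-of-two divisors, in the spirit of
  // combine_vec_sdiv_by_pow2b_PosAndNeg (values chosen for illustration).
  const int32_t Divisors[] = {1, -4, 8, -16};
  const int32_t Inputs[] = {100, -100, 7, -7, 0, -2147483647};
  for (int32_t D : Divisors)
    for (int32_t X : Inputs)
      assert(SDivByPow2Lane(X, D) == X / D);
  return 0;
}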