diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -56,6 +56,7 @@
   X86MacroFusion.cpp
   X86OptimizeLEAs.cpp
   X86PadShortFunction.cpp
+  X86PartialReduction.cpp
   X86RegisterBankInfo.cpp
   X86RegisterInfo.cpp
   X86RetpolineThunks.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -133,6 +133,11 @@
 /// fp exceptions when strict-fp enabled.
 FunctionPass *createX86InsertX87waitPass();
 
+/// This pass optimizes arithmetic based on knowledge that it is only used
+/// by a reduction sequence and is therefore safe to reassociate in
+/// interesting ways.
+FunctionPass *createX86PartialReductionPass();
+
 InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
                                                   X86Subtarget &,
                                                   X86RegisterBankInfo &);
@@ -154,6 +159,7 @@
 void initializeX86ExpandPseudoPass(PassRegistry &);
 void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
 void initializeX86OptimizeLEAPassPass(PassRegistry &);
+void initializeX86PartialReductionPass(PassRegistry &);
 void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
 
 namespace X86AS {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45770,131 +45770,6 @@
                      DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
 }
 
-static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
-                                      const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasSSE2())
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-
-  // If the vector size is less than 128, or greater than the supported RegSize,
-  // do not use PMADD.
-  if (!VT.isVector() || VT.getVectorNumElements() < 8)
-    return SDValue();
-
-  SDValue Op0 = N->getOperand(0);
-  SDValue Op1 = N->getOperand(1);
-
-  auto UsePMADDWD = [&](SDValue Op) {
-    ShrinkMode Mode;
-    return Op.getOpcode() == ISD::MUL &&
-           canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
-           Mode != ShrinkMode::MULU16 &&
-           (!Subtarget.hasSSE41() ||
-            (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
-             Op->isOnlyUserOf(Op.getOperand(1).getNode())));
-  };
-
-  SDValue MulOp, OtherOp;
-  if (UsePMADDWD(Op0)) {
-    MulOp = Op0;
-    OtherOp = Op1;
-  } else if (UsePMADDWD(Op1)) {
-    MulOp = Op1;
-    OtherOp = Op0;
-  } else
-    return SDValue();
-
-  SDLoc DL(N);
-  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
-                                   VT.getVectorNumElements());
-  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
-                                VT.getVectorNumElements() / 2);
-
-  // Shrink the operands of mul.
-  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
-  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
-
-  // Madd vector size is half of the original vector size
-  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
-                           ArrayRef<SDValue> Ops) {
-    MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
-    return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
-  };
-  SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
-                                  PMADDWDBuilder);
-  // Fill the rest of the output with 0
-  SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
-  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
-
-  // Preserve the reduction flag on the ADD. We may need to revisit for the
-  // other operand.
-  SDNodeFlags Flags;
-  Flags.setVectorReduction(true);
-  return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
-}
-
-static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
-                                     const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasSSE2())
-    return SDValue();
-
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-
-  // TODO: There's nothing special about i32, any integer type above i16 should
-  // work just as well.
-  if (!VT.isVector() || !isPowerOf2_32(VT.getVectorNumElements()) ||
-      VT.getVectorElementType() != MVT::i32)
-    return SDValue();
-
-  // We know N is a reduction add. To match SAD, we need one of the operands to
-  // be an ABS.
-  SDValue AbsOp = N->getOperand(0);
-  SDValue OtherOp = N->getOperand(1);
-  if (AbsOp.getOpcode() != ISD::ABS)
-    std::swap(AbsOp, OtherOp);
-  if (AbsOp.getOpcode() != ISD::ABS)
-    return SDValue();
-
-  // Check whether we have an abs-diff pattern feeding into the select.
-  SDValue SadOp0, SadOp1;
-  if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
-    return SDValue();
-
-  // SAD pattern detected. Now build a SAD instruction and an addition for
-  // reduction. Note that the number of elements of the result of SAD is less
-  // than the number of elements of its input. Therefore, we could only update
-  // part of elements in the reduction vector.
-  SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
-
-  // The output of PSADBW is a vector of i64.
-  // We need to turn the vector of i64 into a vector of i32.
-  // If the reduction vector is at least as wide as the psadbw result, just
-  // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of
-  // the PSADBW will be zero.
-  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
-  Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
-
-  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
-    // Fill the upper elements with zero to match the add width.
-    assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
-    unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
-    SmallVector<SDValue, 16> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
-    Ops[0] = Sad;
-    Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
-  } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
-    Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
-                      DAG.getIntPtrConstant(0, DL));
-  }
-
-  // Preserve the reduction flag on the ADD. We may need to revisit for the
-  // other operand.
-  SDNodeFlags Flags;
-  Flags.setVectorReduction(true);
-  return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
-}
-
 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
                             const SDLoc &DL, EVT VT,
                             const X86Subtarget &Subtarget) {
@@ -45984,30 +45859,25 @@
       Mode == ShrinkMode::MULU16)
     return SDValue();
 
+  EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+                                 VT.getVectorNumElements() * 2);
+  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
+  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+
   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                          ArrayRef<SDValue> Ops) {
-    // Shrink by adding truncate nodes and let DAGCombine fold with the
-    // sources.
     EVT InVT = Ops[0].getValueType();
-    assert(InVT.getScalarType() == MVT::i32 &&
-           "Unexpected scalar element type");
     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                  InVT.getVectorNumElements() / 2);
-    EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
-                                   InVT.getVectorNumElements());
-    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
-                       DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
-                       DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
+    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
   };
-  return SplitOpsAndApply(DAG, Subtarget, DL, VT,
-                          { Mul.getOperand(0), Mul.getOperand(1) },
-                          PMADDBuilder);
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
 }
 
 // Attempt to turn this pattern into PMADDWD.
-// (mul (add (sext (build_vector)), (sext (build_vector))),
-//      (add (sext (build_vector)), (sext (build_vector)))
+// (add (mul (sext (build_vector)), (sext (build_vector))),
+//      (mul (sext (build_vector)), (sext (build_vector)))
 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
                               const SDLoc &DL, EVT VT,
                               const X86Subtarget &Subtarget) {
@@ -46129,13 +45999,6 @@
 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
-  const SDNodeFlags Flags = N->getFlags();
-  if (Flags.hasVectorReduction()) {
-    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
-      return Sad;
-    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
-      return MAdd;
-  }
   EVT VT = N->getValueType(0);
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -0,0 +1,460 @@
+//===-- X86PartialReduction.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for add instructions used by a horizontal reduction to see
+// if we might be able to use pmaddwd or psadbw. Some cases of this require
+// cross basic block knowledge and can't be done in SelectionDAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-partial-reduction"
+
+namespace {
+
+class X86PartialReduction : public FunctionPass {
+  const DataLayout *DL;
+  const X86Subtarget *ST;
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+
+  X86PartialReduction() : FunctionPass(ID) { }
+
+  bool runOnFunction(Function &Fn) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+  }
+
+  StringRef getPassName() const override {
+    return "X86 Partial Reduction";
+  }
+
+private:
+  bool tryMAddPattern(BinaryOperator *BO);
+  bool tryMAddReplacement(Value *Op, BinaryOperator *Add);
+
+  bool trySADPattern(BinaryOperator *BO);
+  bool trySADReplacement(Value *Op, BinaryOperator *Add);
+};
+}
+
+FunctionPass *llvm::createX86PartialReductionPass() {
+  return new X86PartialReduction();
+}
+
+char X86PartialReduction::ID = 0;
+
+INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
+                "X86 Partial Reduction", false, false)
+
+static bool isVectorReductionOp(const BinaryOperator &BO) {
+  if (!BO.getType()->isVectorTy())
+    return false;
+
+  unsigned Opcode = BO.getOpcode();
+
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    break;
+  case Instruction::FAdd:
+  case Instruction::FMul:
+    if (auto *FPOp = dyn_cast<FPMathOperator>(&BO))
+      if (FPOp->getFastMathFlags().isFast())
+        break;
+    LLVM_FALLTHROUGH;
+  default:
+    return false;
+  }
+
+  unsigned ElemNum = BO.getType()->getVectorNumElements();
+  // Ensure the reduction size is a power of 2.
+  if (!isPowerOf2_32(ElemNum))
+    return false;
+
+  unsigned ElemNumToReduce = ElemNum;
+
+  // Do DFS search on the def-use chain from the given instruction. We only
+  // allow four kinds of operations during the search until we reach the
+  // instruction that extracts the first element from the vector:
+  //
+  //   1. The reduction operation of the same opcode as the given instruction.
+  //
+  //   2. PHI node.
+  //
+  //   3. ShuffleVector instruction together with a reduction operation that
+  //      does a partial reduction.
+  //
+  //   4. ExtractElement that extracts the first element from the vector, and we
+  //      stop searching the def-use chain here.
+  //
+  // 3 & 4 above perform a reduction on all elements of the vector. We push defs
+  // from 1-3 to the stack to continue the DFS. The given instruction is not
+  // a reduction operation if we meet any instructions other than those
+  // listed above.
+
+  SmallVector<const User *, 16> UsersToVisit{&BO};
+  SmallPtrSet<const User *, 16> Visited;
+  bool ReduxExtracted = false;
+
+  while (!UsersToVisit.empty()) {
+    auto User = UsersToVisit.back();
+    UsersToVisit.pop_back();
+    if (!Visited.insert(User).second)
+      continue;
+
+    for (const auto *U : User->users()) {
+      auto *Inst = dyn_cast<Instruction>(U);
+      if (!Inst)
+        return false;
+
+      if (Inst->getOpcode() == Opcode || isa<PHINode>(U)) {
+        if (auto *FPOp = dyn_cast<FPMathOperator>(Inst))
+          if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
+            return false;
+        UsersToVisit.push_back(U);
+      } else if (auto *ShufInst = dyn_cast<ShuffleVectorInst>(U)) {
+        // Detect the following pattern: A ShuffleVector instruction together
+        // with a reduction that does a partial reduction on the first and
+        // second ElemNumToReduce / 2 elements, and stores the result in
+        // ElemNumToReduce / 2 elements of another vector.
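+        // As an illustration (assuming an 8-element reduction step), the
+        // shape being matched below is expected to look like
+        //   %s = shufflevector <8 x i32> %v, <8 x i32> undef,
+        //                      <8 x i32> <i32 4, i32 5, i32 6, i32 7,
+        //                                 i32 undef, i32 undef, i32 undef, i32 undef>
+        //   %r = add <8 x i32> %v, %s
+        // which is what the mask and operand checks that follow verify.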
+
+        unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
+        if (ResultElements < ElemNum)
+          return false;
+
+        if (ElemNumToReduce == 1)
+          return false;
+        if (!isa<UndefValue>(U->getOperand(1)))
+          return false;
+        for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
+          if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
+            return false;
+        for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
+          if (ShufInst->getMaskValue(i) != -1)
+            return false;
+
+        // There is only one user of this ShuffleVector instruction, which
+        // must be a reduction operation.
+        if (!U->hasOneUse())
+          return false;
+
+        auto *U2 = dyn_cast<Instruction>(*U->user_begin());
+        if (!U2 || U2->getOpcode() != Opcode)
+          return false;
+
+        // Check operands of the reduction operation.
+        if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
+            (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
+          UsersToVisit.push_back(U2);
+          ElemNumToReduce /= 2;
+        } else
+          return false;
+      } else if (isa<ExtractElementInst>(U)) {
+        // At this moment we should have reduced all elements in the vector.
+        if (ElemNumToReduce != 1)
+          return false;
+
+        auto *Val = dyn_cast<ConstantInt>(U->getOperand(1));
+        if (!Val || !Val->isZero())
+          return false;
+
+        ReduxExtracted = true;
+      } else
+        return false;
+    }
+  }
+  return ReduxExtracted;
+}
+
+bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) {
+  BasicBlock *BB = Add->getParent();
+
+  auto *BO = dyn_cast<BinaryOperator>(Op);
+  if (!BO || BO->getOpcode() != Instruction::Mul || !BO->hasOneUse() ||
+      BO->getParent() != BB)
+    return false;
+
+  Value *LHS = BO->getOperand(0);
+  Value *RHS = BO->getOperand(1);
+
+  // LHS and RHS should be only used once, or if they are the same then only
+  // used twice. Only check this when SSE4.1 is enabled and we have zext/sext
+  // instructions, otherwise we use punpck to emulate zero extend in stages. The
+  // truncates we need to do likely won't introduce new instructions in that
+  // case.
+  if (ST->hasSSE41()) {
+    if (LHS == RHS) {
+      if (!isa<Constant>(LHS) && !LHS->hasNUses(2))
+        return false;
+    } else {
+      if (!isa<Constant>(LHS) && !LHS->hasOneUse())
+        return false;
+      if (!isa<Constant>(RHS) && !RHS->hasOneUse())
+        return false;
+    }
+  }
+
+  auto canShrinkOp = [&](Value *Op) {
+    if (isa<Constant>(Op) && ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+      return true;
+    if (auto *Cast = dyn_cast<CastInst>(Op)) {
+      if (Cast->getParent() == BB &&
+          (Cast->getOpcode() == Instruction::SExt ||
+           Cast->getOpcode() == Instruction::ZExt) &&
+          ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+        return true;
+    }
+
+    return false;
+  };
+
+  // Both Ops need to be shrinkable.
+  if (!canShrinkOp(LHS) && !canShrinkOp(RHS))
+    return false;
+
+  IRBuilder<> Builder(Add);
+
+  Type *MulTy = Op->getType();
+  unsigned NumElts = MulTy->getVectorNumElements();
+
+  // Extract even elements and odd elements and add them together. This will
+  // be pattern matched by SelectionDAG to pmaddwd. This instruction will be
+  // half the original width.
+  SmallVector<int, 16> EvenMask(NumElts / 2);
+  SmallVector<int, 16> OddMask(NumElts / 2);
+  for (int i = 0, e = NumElts / 2; i != e; ++i) {
+    EvenMask[i] = i * 2;
+    OddMask[i] = i * 2 + 1;
+  }
+  Value *EvenElts = Builder.CreateShuffleVector(BO, BO, EvenMask);
+  Value *OddElts = Builder.CreateShuffleVector(BO, BO, OddMask);
+  Value *MAdd = Builder.CreateAdd(EvenElts, OddElts);
+
+  // Concatenate zeroes to extend back to the original type.
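+  // A sketch of the resulting IR (assuming a v8i32 multiply): %madd above is
+  // a v4i32 value that SelectionDAG can match to pmaddwd, and the shuffle
+  // built below widens it back to v8i32 by concatenating zeroes, e.g.
+  //   %concat = shufflevector <4 x i32> %madd, <4 x i32> zeroinitializer,
+  //                           <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+  //                                      i32 4, i32 5, i32 6, i32 7>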
+  SmallVector<int, 32> ConcatMask(NumElts);
+  std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+  Value *Zero = Constant::getNullValue(MAdd->getType());
+  Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask);
+
+  // Replace the use of the mul in the original Add with the pmaddwd and zeroes.
+  Add->replaceUsesOfWith(BO, Concat);
+  Add->setHasNoSignedWrap(false);
+  Add->setHasNoUnsignedWrap(false);
+
+  return true;
+}
+
+// Try to replace operands of this add with pmaddwd patterns.
+bool X86PartialReduction::tryMAddPattern(BinaryOperator *BO) {
+  if (!ST->hasSSE2())
+    return false;
+
+  // Need at least 8 elements.
+  if (BO->getType()->getVectorNumElements() < 8)
+    return false;
+
+  // Element type should be i32.
+  if (!BO->getType()->getVectorElementType()->isIntegerTy(32))
+    return false;
+
+  bool Changed = false;
+  Changed |= tryMAddReplacement(BO->getOperand(0), BO);
+  Changed |= tryMAddReplacement(BO->getOperand(1), BO);
+  return Changed;
+}
+
+bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) {
+  // Operand should be a select.
+  auto *SI = dyn_cast<SelectInst>(Op);
+  if (!SI)
+    return false;
+
+  // Select needs to implement absolute value.
+  Value *LHS, *RHS;
+  auto SPR = matchSelectPattern(SI, LHS, RHS);
+  if (SPR.Flavor != SPF_ABS)
+    return false;
+
+  // Need a subtract of two values.
+  auto *Sub = dyn_cast<BinaryOperator>(LHS);
+  if (!Sub || Sub->getOpcode() != Instruction::Sub)
+    return false;
+
+  // Look for zero extend from i8.
+  auto getZeroExtendedVal = [](Value *Op) -> Value * {
+    if (auto *ZExt = dyn_cast<ZExtInst>(Op))
+      if (ZExt->getOperand(0)->getType()->getVectorElementType()->isIntegerTy(8))
+        return ZExt->getOperand(0);
+
+    return nullptr;
+  };
+
+  // Both operands of the subtract should be extends from vXi8.
+  Value *Op0 = getZeroExtendedVal(Sub->getOperand(0));
+  Value *Op1 = getZeroExtendedVal(Sub->getOperand(1));
+  if (!Op0 || !Op1)
+    return false;
+
+  IRBuilder<> Builder(Add);
+
+  Type *OpTy = Op->getType();
+  unsigned NumElts = OpTy->getVectorNumElements();
+
+  unsigned IntrinsicNumElts;
+  Intrinsic::ID IID;
+  if (ST->hasBWI() && NumElts >= 64) {
+    IID = Intrinsic::x86_avx512_psad_bw_512;
+    IntrinsicNumElts = 64;
+  } else if (ST->hasAVX2() && NumElts >= 32) {
+    IID = Intrinsic::x86_avx2_psad_bw;
+    IntrinsicNumElts = 32;
+  } else {
+    IID = Intrinsic::x86_sse2_psad_bw;
+    IntrinsicNumElts = 16;
+  }
+
+  Function *PSADBWFn = Intrinsic::getDeclaration(Add->getModule(), IID);
+
+  if (NumElts < 16) {
+    // Pad input with zeroes.
+    SmallVector<int, 32> ConcatMask(16);
+    for (unsigned i = 0; i != NumElts; ++i)
+      ConcatMask[i] = i;
+    for (unsigned i = NumElts; i != 16; ++i)
+      ConcatMask[i] = (i % NumElts) + NumElts;
+
+    Value *Zero = Constant::getNullValue(Op0->getType());
+    Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask);
+    Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask);
+    NumElts = 16;
+  }
+
+  // Intrinsics produce vXi64 and need to be casted to vXi32.
+  Type *I32Ty = VectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4);
+
+  assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!");
+  unsigned NumSplits = NumElts / IntrinsicNumElts;
+
+  // First collect the pieces we need.
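+  // Illustrative example (assuming v64i8 extends on an SSE2-only target):
+  // NumSplits is 4, so the loop below emits four v16i8 psadbw calls whose
+  // v2i64 results are bitcast to v4i32, and the shuffle tree that follows
+  // concatenates them pairwise until a single vector remains in Ops[0].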
+  SmallVector<Value *, 4> Ops(NumSplits);
+  for (unsigned i = 0; i != NumSplits; ++i) {
+    SmallVector<int, 64> ExtractMask(IntrinsicNumElts);
+    std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts);
+    Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask);
+    Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op0, ExtractMask);
+    Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1});
+    Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty);
+  }
+
+  assert(isPowerOf2_32(NumSplits) && "Expected power of 2 splits");
+  unsigned Stages = Log2_32(NumSplits);
+  for (unsigned s = Stages; s > 0; --s) {
+    unsigned NumConcatElts = Ops[0]->getType()->getVectorNumElements() * 2;
+    for (unsigned i = 0; i != 1 << (s - 1); ++i) {
+      SmallVector<int, 32> ConcatMask(NumConcatElts);
+      std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+      Ops[i] = Builder.CreateShuffleVector(Ops[i*2], Ops[i*2+1], ConcatMask);
+    }
+  }
+
+  // At this point the final value should be in Ops[0]. Now we need to adjust
+  // it to the final original type.
+  NumElts = OpTy->getVectorNumElements();
+  if (NumElts == 2) {
+    // Extract down to 2 elements.
+    Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], {0, 1});
+  } else if (NumElts >= 8) {
+    SmallVector<int, 32> ConcatMask(NumElts);
+    unsigned SubElts = Ops[0]->getType()->getVectorNumElements();
+    for (unsigned i = 0; i != SubElts; ++i)
+      ConcatMask[i] = i;
+    for (unsigned i = SubElts; i != NumElts; ++i)
+      ConcatMask[i] = (i % SubElts) + SubElts;
+
+    Value *Zero = Constant::getNullValue(Ops[0]->getType());
+    Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
+  }
+
+  // Replace the uses of Op in Add with the new sequence.
+  Add->replaceUsesOfWith(Op, Ops[0]);
+  Add->setHasNoSignedWrap(false);
+  Add->setHasNoUnsignedWrap(false);
+
+  return true;
+}
+
+bool X86PartialReduction::trySADPattern(BinaryOperator *BO) {
+  if (!ST->hasSSE2())
+    return false;
+
+  // TODO: There's nothing special about i32, any integer type above i16 should
+  // work just as well.
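+  // Today only vXi32 reduction adds are handled, matching the vXi32 values
+  // produced by the psadbw replacement sequence above.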
+  if (!BO->getType()->getVectorElementType()->isIntegerTy(32))
+    return false;
+
+  bool Changed = false;
+  Changed |= trySADReplacement(BO->getOperand(0), BO);
+  Changed |= trySADReplacement(BO->getOperand(1), BO);
+  return Changed;
+}
+
+bool X86PartialReduction::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  auto &TM = TPC->getTM<X86TargetMachine>();
+  ST = TM.getSubtargetImpl(F);
+
+  DL = &F.getParent()->getDataLayout();
+
+  bool MadeChange = false;
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      auto *BO = dyn_cast<BinaryOperator>(&I);
+      if (!BO)
+        continue;
+
+      if (!isVectorReductionOp(*BO))
+        continue;
+
+      if (BO->getOpcode() == Instruction::Add) {
+        if (tryMAddPattern(BO)) {
+          MadeChange = true;
+          continue;
+        }
+        if (trySADPattern(BO)) {
+          MadeChange = true;
+          continue;
+        }
+      }
+    }
+  }
+
+  return MadeChange;
+}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -84,6 +84,7 @@
   initializeX86FlagsCopyLoweringPassPass(PR);
   initializeX86CondBrFoldingPassPass(PR);
   initializeX86OptimizeLEAPassPass(PR);
+  initializeX86PartialReductionPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -400,8 +401,10 @@
 
   TargetPassConfig::addIRPasses();
 
-  if (TM->getOptLevel() != CodeGenOpt::None)
+  if (TM->getOptLevel() != CodeGenOpt::None) {
     addPass(createInterleavedAccessPass());
+    addPass(createX86PartialReductionPass());
+  }
 
   // Add passes that handle indirect branch removal and insertion of a retpoline
   // thunk. These will be a no-op unless a function subtarget has the retpoline
diff --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll
--- a/llvm/test/CodeGen/X86/O3-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O3-pipeline.ll
@@ -51,6 +51,7 @@
 ; CHECK-NEXT: Expand reduction intrinsics
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Interleaved Access Pass
+; CHECK-NEXT: X86 Partial Reduction
 ; CHECK-NEXT: Expand indirectbr instructions
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Natural Loop Information
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -236,10 +236,10 @@
 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
 ; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2
 ; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm3
+; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
 ; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-NEXT: addq $16, %rcx
@@ -407,16 +407,16 @@
 ; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm4
 ; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %xmm5
 ; AVX1-NEXT: vmovdqu 48(%rsi,%rcx,2), %xmm6
+; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3
+; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
+; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
 ; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6
 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
 ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
 ; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3
 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
 ; AVX1-NEXT: addq $16, %rcx
@@ -453,10 +453,10 @@
 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
 ; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3
 ; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4
-; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
 ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3
+; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: addq $16, %rcx
 ; AVX2-NEXT: cmpq %rcx, %rax
 ; AVX2-NEXT: jne .LBB3_1
@@ -779,18 +779,18 @@
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
 ; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm3
 ; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
 ; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
 ; SSE2-NEXT: psraw $8, %xmm6
 ; SSE2-NEXT: pmaddwd %xmm5, %xmm6
-; SSE2-NEXT: paddd %xmm6, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: paddd %xmm6, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT: psraw $8, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT: psraw $8, %xmm4
 ; SSE2-NEXT: pmaddwd %xmm3, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm1
 ; SSE2-NEXT: addq $16, %rcx
 ; SSE2-NEXT: cmpq %rcx, %rax
 ; SSE2-NEXT: jne .LBB6_1
@@ -814,16 +814,16 @@
 ; AVX1-NEXT: .p2align 4, 0x90
 ; AVX1-NEXT: .LBB6_1: # %vector.body
 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm2
-; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm3
-; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4
-; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2
+; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3
 ; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4
 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: addq $16, %rcx
 ; AVX1-NEXT: cmpq %rcx, %rax
 ; AVX1-NEXT: jne .LBB6_1
@@ -943,34 +943,34 @@
 ; SSE2-NEXT: .p2align 4, 0x90
 ; SSE2-NEXT: .LBB7_1: # %vector.body
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm10
-; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm7
-; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm9
-; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7
+; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm10
+; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm0
+; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
 ; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
 ; SSE2-NEXT: psraw $8, %xmm6
 ; SSE2-NEXT: pmaddwd %xmm5, %xmm6
-; SSE2-NEXT: paddd %xmm6, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; SSE2-NEXT: paddd %xmm6, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
 ; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT: psraw $8, %xmm0
 ; SSE2-NEXT: pmaddwd %xmm5, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
+; SSE2-NEXT: paddd %xmm0, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
 ; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
 ; SSE2-NEXT: psraw $8, %xmm5
 ; SSE2-NEXT: pmaddwd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
 ; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
 ; SSE2-NEXT: psraw $8, %xmm5
 ; SSE2-NEXT: pmaddwd %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: paddd %xmm5, %xmm3
 ; SSE2-NEXT: addq $32, %rcx
 ; SSE2-NEXT: cmpq %rcx, %rax
 ; SSE2-NEXT: jne .LBB7_1
@@ -999,26 +999,26 @@
 ; AVX1-NEXT: .p2align 4, 0x90
 ; AVX1-NEXT: .LBB7_1: # %vector.body
 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3
-; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm4
-; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm5
-; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm6
-; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3
+; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4
+; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5
+; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6
+; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3
-; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
-; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
 ; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
 ; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
 ; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-NEXT: addq $32, %rcx
 ; AVX1-NEXT: cmpq %rcx, %rax
 ; AVX1-NEXT: jne .LBB7_1
@@ -1051,14 +1051,14 @@
 ; AVX2-NEXT: .p2align 4, 0x90
 ; AVX2-NEXT: .LBB7_1: # %vector.body
 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3
-; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm4
-; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5
+; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
+; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
+; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
 ; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm3
-; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
+; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT: addq $32, %rcx
 ; AVX2-NEXT: cmpq %rcx, %rax
 ; AVX2-NEXT: jne .LBB7_1
@@ -1913,9 +1913,9 @@
 ;
 ; AVX1-LABEL: pmaddwd_16:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -1944,16 +1944,16 @@
 ;
 ; AVX1-LABEL: pmaddwd_32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: pmaddwd_32:
@@ -1964,9 +1964,9 @@
 ;
 ; AVX512F-LABEL: pmaddwd_32:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
@@ -2126,9 +2126,9 @@
 ;
 ; AVX1-LABEL: jumbled_indices8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -2157,16 +2157,16 @@
 ;
 ; AVX1-LABEL: jumbled_indices16:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: jumbled_indices16:
@@ -2177,9 +2177,9 @@
 ;
 ; AVX512F-LABEL: jumbled_indices16:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
@@ -2221,26 +2221,26 @@
 ;
 ; AVX1-LABEL: jumbled_indices32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm8, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm9, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm10, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm12
-; AVX1-NEXT: vpmaddwd %xmm12, %xmm11, %xmm11
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT: vpmaddwd %xmm8, %xmm9, %xmm8
 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm8, %xmm4
 ; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT: vpmaddwd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: jumbled_indices32:
@@ -2656,7 +2656,7 @@
 ; AVX-NEXT: vmovdqu (%rdx), %xmm1
 ; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
 ; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
@@ -2698,14 +2698,14 @@
 ; SSE2-NEXT: movdqu (%r8), %xmm0
 ; SSE2-NEXT: movdqu (%r9), %xmm3
 ; SSE2-NEXT: pmaddwd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm3
 ; SSE2-NEXT: movdqu (%rax), %xmm0
-; SSE2-NEXT: movdqu (%r10), %xmm2
-; SSE2-NEXT: pmaddwd %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm2
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: movdqu (%r10), %xmm1
+; SSE2-NEXT: pmaddwd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: movd %xmm1, %eax
@@ -2721,11 +2721,11 @@
 ; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vmovdqu (%r8), %xmm2
 ; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: vmovdqu (%rax), %xmm2
 ; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2
-; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -180,14 +180,14 @@
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB8_1: # %vector.body
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3
-; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm4
-; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5
+; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
+; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
+; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
 ; CHECK-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1
-; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm3
-; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
 ; CHECK-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
+; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
+; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1
 ; CHECK-NEXT: addq $32, %rcx
 ; CHECK-NEXT: cmpq %rcx, %rax
 ; CHECK-NEXT: jne .LBB8_1
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -158,12 +158,12 @@
 ; SSE2-NEXT: .p2align 4, 0x90
 ; SSE2-NEXT: .LBB1_1: # %vector.body
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
-; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
-; SSE2-NEXT: paddd %xmm3, %xmm1
 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
 ; SSE2-NEXT: psadbw b+1024(%rax), %xmm3
 ; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm3
+; SSE2-NEXT: psadbw b+1040(%rax), %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm1
 ; SSE2-NEXT: addq $4, %rax
 ; SSE2-NEXT: jne .LBB1_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
@@ -188,14 +188,14 @@
 ; AVX1-NEXT: .p2align 4, 0x90
 ; AVX1-NEXT: .LBB1_1: # %vector.body
 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm2
-; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm2
+; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3
+; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-NEXT: addq $4, %rax
 ; AVX1-NEXT: jne .LBB1_1
 ; AVX1-NEXT: # %bb.2: # %middle.block
@@ -320,15 +320,15 @@
 ; SSE2-NEXT: .p2align 4, 0x90
 ; SSE2-NEXT: .LBB2_1: # %vector.body
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa a+1056(%rax), %xmm5
-; SSE2-NEXT: psadbw b+1056(%rax), %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm5
-; SSE2-NEXT: psadbw b+1040(%rax), %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm3
 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm5
 ; SSE2-NEXT: psadbw b+1024(%rax), %xmm5
 ; SSE2-NEXT: paddd %xmm5, %xmm0
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm5
+; SSE2-NEXT: psadbw b+1040(%rax), %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm3
+; SSE2-NEXT: movdqa a+1056(%rax), %xmm5
+; SSE2-NEXT: psadbw b+1056(%rax), %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm2
 ; SSE2-NEXT: movdqa a+1072(%rax), %xmm5
 ; SSE2-NEXT: psadbw b+1072(%rax), %xmm5
 ; SSE2-NEXT: paddd %xmm5, %xmm1
@@ -364,22 +364,22 @@
 ; AVX1-NEXT: .p2align 4, 0x90
 ; AVX1-NEXT: .LBB2_1: # %vector.body
 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm4
-; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3
-; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm4
-; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3
+; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm4
+; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm5
+; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm5, %xmm5
+; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm6
+; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX1-NEXT: addq $4, %rax
 ; AVX1-NEXT: jne .LBB2_1
 ; AVX1-NEXT: # %bb.2: # %middle.block
@@ -416,12 +416,12 @@
 ; AVX2-NEXT: .p2align 4, 0x90
 ; AVX2-NEXT: .LBB2_1: # %vector.body
 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3
-; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm3
 ; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3
 ; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3
+; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: addq $4, %rax
 ; AVX2-NEXT: jne .LBB2_1
 ; AVX2-NEXT: # %bb.2: # %middle.block
@@ -449,11 +449,11 @@
 ; AVX512F-NEXT: .p2align 4, 0x90
 ; AVX512F-NEXT: .LBB2_1: # %vector.body
 ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm2
-; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm3
-; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
+; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm3
+; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
 ; AVX512F-NEXT: addq $4, %rax
 ; AVX512F-NEXT: jne .LBB2_1
@@ -554,10 +554,10 @@
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
 ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT: pand %xmm1, %xmm3
 ; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psadbw %xmm3, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: psadbw %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm0
 ; SSE2-NEXT: addq $4, %rax
 ; SSE2-NEXT: jne .LBB3_1
 ; SSE2-NEXT: # %bb.2: # %middle.block
@@ -576,8 +576,8 @@
 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1
 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
 ; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT: vpaddd %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: addq $4, %rax
@@ -649,7 +649,7 @@
 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1
 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: addq $4, %rax
 ; AVX-NEXT: jne .LBB4_1
@@ -1112,7 +1112,7 @@
 ; AVX-NEXT: vmovdqu (%rdx), %xmm1
 ; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
 ; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]