Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -674,13 +674,6 @@ /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); - /// \reorder commutative operands in alt shuffle if they result in - /// vectorized code. - void reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right); - /// \reorder commutative operands to get better probability of /// generating vectorized code. void reorderInputsAccordingToOpcode(const InstructionsState &S, @@ -2072,7 +2065,7 @@ // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; - reorderAltShuffleOperands(S, VL, Left, Right); + reorderInputsAccordingToOpcode(S, VL, Left, Right); UserTreeIdx.EdgeIdx = 0; buildTree_rec(Left, Depth + 1, UserTreeIdx); UserTreeIdx.EdgeIdx = 1; @@ -2787,63 +2780,6 @@ return getGatherCost(VecTy, ShuffledElements); } -// Reorder commutative operations in alternate shuffle if the resulting vectors -// are consecutive loads. This would allow us to vectorize the tree. -// If we have something like- -// load a[0] - load b[0] -// load b[1] + load a[1] -// load a[2] - load b[2] -// load a[3] + load b[3] -// Reordering the second load b[1] load a[1] would allow us to vectorize this -// code. -void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right) { - // Push left and right operands of binary operation into Left and Right - for (Value *V : VL) { - auto *I = cast(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); - } - - // Reorder if we have a commutative operation and consecutive access - // are on either side of the alternate instructions. - for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) { - if (LoadInst *L = dyn_cast(Left[j])) { - if (LoadInst *L1 = dyn_cast(Right[j + 1])) { - Instruction *VL1 = cast(VL[j]); - Instruction *VL2 = cast(VL[j + 1]); - if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j], Right[j]); - continue; - } else if (VL2->isCommutative() && - isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - // else unchanged - } - } - if (LoadInst *L = dyn_cast(Right[j])) { - if (LoadInst *L1 = dyn_cast(Left[j + 1])) { - Instruction *VL1 = cast(VL[j]); - Instruction *VL2 = cast(VL[j + 1]); - if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j], Right[j]); - continue; - } else if (VL2->isCommutative() && - isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - // else unchanged - } - } - } -} - // Return true if the i'th left and right operands can be commuted. // // The vectorizer is trying to either have all elements one side being @@ -2918,10 +2854,12 @@ ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right) { - if (!VL.empty()) { - // Peel the first iteration out of the loop since there's nothing - // interesting to do anyway and it simplifies the checks in the loop. - auto *I = cast(VL[0]); + assert(!VL.empty() && Left.empty() && Right.empty()); + + // Push left and right operands of binary operation into Left and Right + for (Value *V : VL) { + auto *I = cast(V); + assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); Left.push_back(I->getOperand(0)); Right.push_back(I->getOperand(1)); } @@ -2935,15 +2873,10 @@ for (unsigned i = 1, e = VL.size(); i != e; ++i) { Instruction *I = cast(VL[i]); - assert(((I->getOpcode() == S.getOpcode() && I->isCommutative()) || - (I->getOpcode() != S.getOpcode() && - Instruction::isCommutative(S.getOpcode()))) && - "Can only process commutative instruction"); // Commute to favor either a splat or maximizing having the same opcodes on // one side. - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); - if (shouldReorderOperands(i, Left, Right, AllSameOpcodeLeft, + if (I->isCommutative() && + shouldReorderOperands(i, Left, Right, AllSameOpcodeLeft, AllSameOpcodeRight, SplatLeft, SplatRight)) std::swap(Left[i], Right[i]); @@ -2965,11 +2898,11 @@ // Finally check if we can get longer vectorizable chain by reordering // without breaking the good operand order detected above. // E.g. If we have something like- - // load a[0] load b[0] - // load b[1] load a[1] - // load a[2] load b[2] - // load a[3] load b[3] - // Reordering the second load b[1] load a[1] would allow us to vectorize + // load a[0] - load b[0] + // load b[1] + load a[1] + // load a[2] - load b[2] + // load a[3] + load b[3] + // Reordering the second load b[1] + load a[1] would allow us to vectorize // this code and we still retain AllSameOpcode property. // FIXME: This load reordering might break AllSameOpcode in some rare cases // such as- @@ -2981,16 +2914,30 @@ if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { if (isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; + Instruction *VL1 = cast(VL[j]); + Instruction *VL2 = cast(VL[j + 1]); + if (VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } else if (VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } } } } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { if (isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; + Instruction *VL1 = cast(VL[j]); + Instruction *VL2 = cast(VL[j + 1]); + if (VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } else if (VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } } } } Index: test/Transforms/SLPVectorizer/X86/PR39774.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR39774.ll +++ test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -81,9 +81,9 @@ ; CHECK-NEXT: [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]] ; CHECK-NEXT: [[VAL_42:%.*]] = and i32 [[VAL_40]], undef ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[OP_EXTRA30]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 14910, i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 14910, i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32>