Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -596,12 +596,12 @@
   /// \reorder commutative operands in alt shuffle if they result in
   /// vectorized code.
-  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+  void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
                                  SmallVectorImpl<Value *> &Left,
                                  SmallVectorImpl<Value *> &Right);
   /// \reorder commutative operands to get better probability of
   /// generating vectorized code.
-  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+  void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,
                                       SmallVectorImpl<Value *> &Left,
                                       SmallVectorImpl<Value *> &Right);
 
   struct TreeEntry {
@@ -1635,7 +1635,7 @@
       // have the same opcode.
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right);
+        reorderInputsAccordingToOpcode(VL0->getOpcode(), VL, Left, Right);
         buildTree_rec(Left, Depth + 1, UserTreeIdx);
         buildTree_rec(Right, Depth + 1, UserTreeIdx);
         return;
@@ -1799,7 +1799,7 @@
       // Reorder operands if reordering would enable vectorization.
       if (isa<BinaryOperator>(VL0)) {
         ValueList Left, Right;
-        reorderAltShuffleOperands(VL, Left, Right);
+        reorderAltShuffleOperands(VL0->getOpcode(), VL, Left, Right);
         buildTree_rec(Left, Depth + 1, UserTreeIdx);
         buildTree_rec(Right, Depth + 1, UserTreeIdx);
         return;
@@ -2344,13 +2344,17 @@
 // load a[3] + load b[3]
 // Reordering the second load b[1]  load a[1] would allow us to vectorize this
 // code.
-void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
+void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
                                         SmallVectorImpl<Value *> &Left,
                                         SmallVectorImpl<Value *> &Right) {
   // Push left and right operands of binary operation into Left and Right
-  for (Value *i : VL) {
-    Left.push_back(cast<Instruction>(i)->getOperand(0));
-    Right.push_back(cast<Instruction>(i)->getOperand(1));
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  for (Value *V : VL) {
+    auto *I = cast<Instruction>(V);
+    assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
+           "Incorrect instruction in vector");
+    Left.push_back(I->getOperand(0));
+    Right.push_back(I->getOperand(1));
   }
 
   // Reorder if we have a commutative operation and consecutive access
@@ -2395,14 +2399,12 @@
 // The vectorizer is trying to either have all elements one side being
 // instruction with the same opcode to enable further vectorization, or having
 // a splat to lower the vectorizing cost.
-static bool shouldReorderOperands(int i, Instruction &I,
-                                  SmallVectorImpl<Value *> &Left,
-                                  SmallVectorImpl<Value *> &Right,
-                                  bool AllSameOpcodeLeft,
-                                  bool AllSameOpcodeRight, bool SplatLeft,
-                                  bool SplatRight) {
-  Value *VLeft = I.getOperand(0);
-  Value *VRight = I.getOperand(1);
+static bool shouldReorderOperands(
+    int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
+    ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
+    bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
+  VLeft = I.getOperand(0);
+  VRight = I.getOperand(1);
   // If we have "SplatRight", try to see if commuting is needed to preserve it.
   if (SplatRight) {
     if (VRight == Right[i - 1])
@@ -2458,15 +2460,19 @@
   return false;
 }
 
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
+                                             ArrayRef<Value *> VL,
                                              SmallVectorImpl<Value *> &Left,
                                              SmallVectorImpl<Value *> &Right) {
   if (VL.size()) {
     // Peel the first iteration out of the loop since there's nothing
     // interesting to do anyway and it simplifies the checks in the loop.
-    auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
-    auto VRight = cast<Instruction>(VL[0])->getOperand(1);
+    auto *I = cast<Instruction>(VL[0]);
+    Value *VLeft;
+    Value *VRight;
+    VLeft = I->getOperand(0);
+    VRight = I->getOperand(1);
     if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
       // Favor having instruction to the right. FIXME: why?
       std::swap(VLeft, VRight);
@@ -2483,16 +2489,21 @@
 
   for (unsigned i = 1, e = VL.size(); i != e; ++i) {
     Instruction *I = cast<Instruction>(VL[i]);
-    assert(I->isCommutative() && "Can only process commutative instruction");
+    assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
+            (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
+           "Can only process commutative instruction");
     // Commute to favor either a splat or maximizing having the same opcodes on
     // one side.
-    if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
-                              AllSameOpcodeRight, SplatLeft, SplatRight)) {
-      Left.push_back(I->getOperand(1));
-      Right.push_back(I->getOperand(0));
+    Value *VLeft;
+    Value *VRight;
+    if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
+                              AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
+                              VRight)) {
+      Left.push_back(VRight);
+      Right.push_back(VLeft);
     } else {
-      Left.push_back(I->getOperand(0));
-      Right.push_back(I->getOperand(1));
+      Left.push_back(VLeft);
+      Right.push_back(VRight);
     }
     // Update Splat* and AllSameOpcode* after the insertion.
     SplatRight = SplatRight && (Right[i - 1] == Right[i]);
@@ -2843,11 +2854,13 @@
     case Instruction::Xor: {
       ValueList LHSVL, RHSVL;
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
-        reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+        reorderInputsAccordingToOpcode(VL0->getOpcode(),
+                                       E->Scalars, LHSVL, RHSVL);
       else
         for (Value *V : E->Scalars) {
-          LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
-          RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
+          auto *I = cast<Instruction>(V);
+          LHSVL.push_back(I->getOperand(0));
+          RHSVL.push_back(I->getOperand(1));
         }
 
       setInsertPointAfterBundle(E->Scalars, VL0);
@@ -3011,7 +3024,7 @@
     case Instruction::ShuffleVector: {
       ValueList LHSVL, RHSVL;
       assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
-      reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
+      reorderAltShuffleOperands(VL0->getOpcode(), E->Scalars, LHSVL, RHSVL);
       setInsertPointAfterBundle(E->Scalars, VL0);
 
       Value *LHS = vectorizeTree(LHSVL);
Index: test/Transforms/SLPVectorizer/X86/reorder.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/reorder.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -mcpu=bdver1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common local_unnamed_addr global i32 0, align 4
+@c = common local_unnamed_addr global [1 x i32] zeroinitializer, align 4
+
+define i32 @foo() local_unnamed_addr #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> <i32 8, i32 1, i32 2, i32 3>, [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
+; CHECK-NEXT:    store i32 [[TMP6]], i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 1, i64 0), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
+; CHECK-NEXT:    store i32 [[TMP7]], i32* getelementptr ([1 x i32], [1 x i32]* @c, i64 2, i64 0), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    store i32 [[TMP8]], i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 0, i64 0), align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast (i32* getelementptr ([1 x i32], [1 x i32]* @c, i64 7, i64 0) to <4 x i32>*), align 4
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %0 = load i32, i32* @a, align 4
+  %add = add nsw i32 %0, 1
+  %add1 = add nsw i32 %0, 2
+  %add6 = add nsw i32 %0, 3
+  %add11 = add nsw i32 %0, 8
+  store i32 %add1, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 1, i64 0), align 4
+  store i32 %add6, i32* getelementptr ([1 x i32], [1 x i32]* @c, i64 2, i64 0), align 4
+  store i32 %add11, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 0, i64 0), align 4
+  store i32 %add, i32* getelementptr ([1 x i32], [1 x i32]* @c, i64 8, i64 0), align 4
+  store i32 %add1, i32* getelementptr ([1 x i32], [1 x i32]* @c, i64 9, i64 0), align 4
+  store i32 %add6, i32* getelementptr ([1 x i32], [1 x i32]* @c, i64 10, i64 0), align 4
+  store i32 %add11, i32* getelementptr ([1 x i32], [1 x i32]* @c, i64 7, i64 0), align 4
+  ret i32 undef
+}
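
The kind of reordering the new test exercises can be seen on an even smaller input. The sketch below is illustrative only and is not part of the patch: the function name @sketch and the pointer argument %p are made up, and whether the cost model actually vectorizes such a small bundle depends on the target. The point is the operand shape: both adds are commutative but keep their non-constant operand on different sides, and reorderInputsAccordingToOpcode is what moves the variable operands to one side and the constants to the other so a single vector add over a splat of %x and a constant vector can be formed.

; Illustrative sketch only (hypothetical IR, not part of this patch).
define void @sketch(i32 %x, i32* %p) {
entry:
  %a = add nsw i32 %x, 1      ; variable on the left, constant on the right
  %b = add nsw i32 2, %x      ; commuted: constant on the left
  %p0 = getelementptr inbounds i32, i32* %p, i64 0
  %p1 = getelementptr inbounds i32, i32* %p, i64 1
  store i32 %a, i32* %p0, align 4
  store i32 %b, i32* %p1, align 4
  ret void
}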