Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -596,12 +596,12 @@
   /// \reorder commutative operands in alt shuffle if they result in
   /// vectorized code.
-  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+  void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
                                  SmallVectorImpl<Value *> &Left,
                                  SmallVectorImpl<Value *> &Right);
   /// \reorder commutative operands to get better probability of
   /// generating vectorized code.
-  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+  void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,
                                       SmallVectorImpl<Value *> &Left,
                                       SmallVectorImpl<Value *> &Right);
 
   struct TreeEntry {
@@ -1302,6 +1302,38 @@
   }
 }
 
+static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) {
+  switch(Opcode) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return ConstantInt::getNullValue(Ty);
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+    return ConstantInt::get(Ty, /*V=*/1);
+  case Instruction::FAdd:
+  case Instruction::FSub:
+    return ConstantFP::get(Ty, /*V=*/0.0);
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+    return ConstantFP::get(Ty, /*V=*/1.0);
+  case Instruction::And:
+    return ConstantInt::getAllOnesValue(Ty);
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return ConstantInt::getNullValue(Type::getInt32Ty(Ty->getContext()));
+  default:
+    break;
+  }
+  llvm_unreachable("unknown binop for default constant value");
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             int UserTreeIdx) {
   bool isAltShuffle = false;
@@ -1635,7 +1667,7 @@
       // have the same opcode.
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right);
+        reorderInputsAccordingToOpcode(VL0->getOpcode(), VL, Left, Right);
         buildTree_rec(Left, Depth + 1, UserTreeIdx);
         buildTree_rec(Right, Depth + 1, UserTreeIdx);
         return;
@@ -1799,7 +1831,7 @@
       // Reorder operands if reordering would enable vectorization.
       if (isa<BinaryOperator>(VL0)) {
         ValueList Left, Right;
-        reorderAltShuffleOperands(VL, Left, Right);
+        reorderAltShuffleOperands(VL0->getOpcode(), VL, Left, Right);
        buildTree_rec(Left, Depth + 1, UserTreeIdx);
         buildTree_rec(Right, Depth + 1, UserTreeIdx);
         return;
@@ -2344,13 +2376,20 @@
 // load a[3] + load b[3]
 // Reordering the second load b[1] load a[1] would allow us to vectorize this
 // code.
-void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
+void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
                                         SmallVectorImpl<Value *> &Left,
                                         SmallVectorImpl<Value *> &Right) {
   // Push left and right operands of binary operation into Left and Right
-  for (Value *i : VL) {
-    Left.push_back(cast<Instruction>(i)->getOperand(0));
-    Right.push_back(cast<Instruction>(i)->getOperand(1));
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  for (Value *V : VL) {
+    auto *I = cast<Instruction>(V);
+    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
+      Left.push_back(I->getOperand(0));
+      Right.push_back(I->getOperand(1));
+    } else {
+      Left.push_back(I);
+      Right.push_back(getDefaultConstantForOpcode(Opcode, I->getType()));
+    }
   }
 
   // Reorder if we have a commutative operation and consecutive access
@@ -2395,14 +2434,17 @@
 // The vectorizer is trying to either have all elements one side being
 // instruction with the same opcode to enable further vectorization, or having
 // a splat to lower the vectorizing cost.
-static bool shouldReorderOperands(int i, Instruction &I,
-                                  SmallVectorImpl<Value *> &Left,
-                                  SmallVectorImpl<Value *> &Right,
-                                  bool AllSameOpcodeLeft,
-                                  bool AllSameOpcodeRight, bool SplatLeft,
-                                  bool SplatRight) {
-  Value *VLeft = I.getOperand(0);
-  Value *VRight = I.getOperand(1);
+static bool shouldReorderOperands(
+    int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
+    ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
+    bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
+  if (I.getOpcode() == Opcode) {
+    VLeft = I.getOperand(0);
+    VRight = I.getOperand(1);
+  } else {
+    VLeft = &I;
+    VRight = getDefaultConstantForOpcode(Opcode, I.getType());
+  }
   // If we have "SplatRight", try to see if commuting is needed to preserve it.
   if (SplatRight) {
     if (VRight == Right[i - 1])
@@ -2458,15 +2500,24 @@
   return false;
 }
 
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
+                                             ArrayRef<Value *> VL,
                                              SmallVectorImpl<Value *> &Left,
                                              SmallVectorImpl<Value *> &Right) {
   if (VL.size()) {
     // Peel the first iteration out of the loop since there's nothing
     // interesting to do anyway and it simplifies the checks in the loop.
-    auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
-    auto VRight = cast<Instruction>(VL[0])->getOperand(1);
+    auto *I = cast<Instruction>(VL[0]);
+    Value *VLeft;
+    Value *VRight;
+    if (I->getOpcode() == Opcode) {
+      VLeft = I->getOperand(0);
+      VRight = I->getOperand(1);
+    } else {
+      VLeft = I;
+      VRight = getDefaultConstantForOpcode(Opcode, I->getType());
+    }
     if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
       // Favor having instruction to the right. FIXME: why?
       std::swap(VLeft, VRight);
@@ -2483,16 +2534,21 @@
   for (unsigned i = 1, e = VL.size(); i != e; ++i) {
     Instruction *I = cast<Instruction>(VL[i]);
-    assert(I->isCommutative() && "Can only process commutative instruction");
+    assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
+            (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
+           "Can only process commutative instruction");
     // Commute to favor either a splat or maximizing having the same opcodes on
     // one side.
-    if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
-                              AllSameOpcodeRight, SplatLeft, SplatRight)) {
-      Left.push_back(I->getOperand(1));
-      Right.push_back(I->getOperand(0));
+    Value *VLeft;
+    Value *VRight;
+    if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
+                              AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
+                              VRight)) {
+      Left.push_back(VRight);
+      Right.push_back(VLeft);
     } else {
-      Left.push_back(I->getOperand(0));
-      Right.push_back(I->getOperand(1));
+      Left.push_back(VLeft);
+      Right.push_back(VRight);
     }
     // Update Splat* and AllSameOpcode* after the insertion.
     SplatRight = SplatRight && (Right[i - 1] == Right[i]);
@@ -2843,11 +2899,19 @@
     case Instruction::Xor: {
       ValueList LHSVL, RHSVL;
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
-        reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+        reorderInputsAccordingToOpcode(VL0->getOpcode(),
+                                       E->Scalars, LHSVL, RHSVL);
       else
         for (Value *V : E->Scalars) {
-          LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
-          RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
+          auto *I = cast<Instruction>(V);
+          if (I->getOpcode() == VL0->getOpcode()) {
+            LHSVL.push_back(I->getOperand(0));
+            RHSVL.push_back(I->getOperand(1));
+          } else {
+            LHSVL.push_back(V);
+            RHSVL.push_back(
+                getDefaultConstantForOpcode(VL0->getOpcode(), I->getType()));
+          }
         }
 
       setInsertPointAfterBundle(E->Scalars, VL0);
@@ -3011,7 +3075,7 @@
     case Instruction::ShuffleVector: {
       ValueList LHSVL, RHSVL;
       assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
-      reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
+      reorderAltShuffleOperands(VL0->getOpcode(), E->Scalars, LHSVL, RHSVL);
       setInsertPointAfterBundle(E->Scalars, VL0);
 
       Value *LHS = vectorizeTree(LHSVL);
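For reference before the new test: for most opcodes, getDefaultConstantForOpcode returns the operation's right identity, so a lane padded to "x op constant" keeps its scalar value; the exceptions are URem/SRem, whose constant 1 is not value-preserving (x % 1 is 0), and the shift opcodes, which return an i32 zero shift amount. Below is a minimal standalone C++ sketch of that identity property for the integer opcodes; identityFor and apply are made-up helpers for illustration and are not part of the patch.

// Standalone sketch (no LLVM dependency) of the invariant the padding relies
// on: widening a lone scalar x into "x op identity" must leave x unchanged.
// identityFor/apply are hypothetical helpers used only for this illustration.
#include <cassert>
#include <initializer_list>

int identityFor(char op) {
  switch (op) {
  case '+': case '-': case '|': case '^': return 0;  // x + 0 == x, x - 0 == x, ...
  case '*': case '/': return 1;                      // x * 1 == x, x / 1 == x
  case '&': return ~0;                               // x & all-ones == x
  default: assert(false && "unhandled op"); return 0;
  }
}

int apply(char op, int a, int b) {
  switch (op) {
  case '+': return a + b;
  case '-': return a - b;
  case '*': return a * b;
  case '/': return a / b;
  case '|': return a | b;
  case '^': return a ^ b;
  case '&': return a & b;
  default: assert(false && "unhandled op"); return 0;
  }
}

int main() {
  // Padding a scalar lane x to "x op identity" keeps its value, mirroring how
  // the patch pads the Right operand list with the generated constant.
  for (char op : {'+', '-', '*', '/', '|', '^', '&'})
    for (int x : {0, 1, 7, -13, 42})
      assert(apply(op, x, identityFor(op)) == x);
  return 0;
}

Note that 0 is only a right identity for Sub (x - 0 == x, but 0 - x != x), which lines up with reorderAltShuffleOperands always pushing the odd-opcode instruction onto Left and the generated constant onto Right.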
Index: test/Transforms/SLPVectorizer/X86/reorder.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/reorder.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -slp-vectorizer -slp-threshold=-10 -mcpu=skx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global float 0.000000e+00, align 4
+@p = common global [1 x i32] zeroinitializer, align 4
+@q = common global [1 x i32] zeroinitializer, align 4
+@c = common global i32 0, align 4
+
+define i32 @foo() local_unnamed_addr #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* @b, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> <float -0.000000e+00, float undef, float undef, float undef>, float [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float -0.000000e+00, i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float 0.000000e+00, i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float 1.000000e+00, i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub <4 x float> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = fptosi <4 x float> [[TMP11]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast ([1 x i32]* @p to <4 x i32>*), align 4
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %0 = load float, float* @b, align 4
+  %1 = fmul float %0, 0.000000e+00
+  %mul = fsub float -0.000000e+00, %1
+  %conv1 = fptosi float %mul to i32
+  store i32 %conv1, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @p, i64 0, i64 0), align 4
+  %add = fadd float %0, 0.000000e+00
+  %conv4 = fptosi float %add to i32
+  store i32 %conv4, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @p, i64 1, i64 0), align 4
+  %mul.1 = fsub float -0.000000e+00, %0
+  %conv1.1 = fptosi float %mul.1 to i32
+  store i32 %conv1.1, i32* getelementptr ([1 x i32], [1 x i32]* @p, i64 2, i64 0), align 4
+  %add.1 = fadd float %0, 1.000000e+00
+  %conv4.1 = fptosi float %add.1 to i32
+  store i32 %conv4.1, i32* getelementptr ([1 x i32], [1 x i32]* @p, i64 3, i64 0), align 4
+  ret i32 undef
+}
+
+define i32 @foo1() local_unnamed_addr #0 {
+; CHECK-LABEL: @foo1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 3, i64 0), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 7, i64 0), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 11, i64 0), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 15, i64 0), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 19, i64 0), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 23, i64 0), align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 27, i64 0), align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 31, i64 0), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP7]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP6]], i32 3
+; CHECK-NEXT:    [[TMP16:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP15]]
+; CHECK-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* bitcast ([1 x i32]* @p to <4 x i32>*), align 4
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %0 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 3, i64 0), align 4
+  %1 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 7, i64 0), align 4
+  %add5 = add nsw i32 %1, %0
+  store i32 %add5, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @p, i64 0, i64 0), align 4
+  %2 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 11, i64 0), align 4
+  %3 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 15, i64 0), align 4
+  %add5.1 = add nsw i32 %3, %2
+  store i32 %add5.1, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @p, i64 1, i64 0), align 4
+  %4 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 19, i64 0), align 4
+  %5 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 23, i64 0), align 4
+  %add5.2 = add nsw i32 %5, %4
+  store i32 %add5.2, i32* getelementptr ([1 x i32], [1 x i32]* @p, i64 2, i64 0), align 4
+  %6 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 27, i64 0), align 4
+  %7 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 31, i64 0), align 4
+  %add5.3 = add nsw i32 %7, %6
+  store i32 %add5.3, i32* getelementptr ([1 x i32], [1 x i32]* @p, i64 3, i64 0), align 4
+  ret i32 undef
+}
+
+define i32 @foo2() local_unnamed_addr #0 {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @q to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> [[TMP0]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([1 x i32]* @p to <4 x i32>*), align 4
+; CHECK-NEXT:    store i32 4, i32* @c, align 4
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %0 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @q, i64 0, i64 0), align 4
+  %shr = ashr i32 %0, 3
+  store i32 %shr, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @p, i64 0, i64 0), align 4
+  %1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @q, i64 1, i64 0), align 4
+  %shr.1 = ashr i32 %1, 3
+  store i32 %shr.1, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @p, i64 1, i64 0), align 4
+  %2 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 2, i64 0), align 4
+  %shr.2 = ashr i32 %2, 3
+  store i32 %shr.2, i32* getelementptr ([1 x i32], [1 x i32]* @p, i64 2, i64 0), align 4
+  %3 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @q, i64 3, i64 0), align 4
+  %shr.3 = ashr i32 %3, 3
+  store i32 %shr.3, i32* getelementptr ([1 x i32], [1 x i32]* @p, i64 3, i64 0), align 4
+  store i32 4, i32* @c, align 4
+  ret i32 undef
+}
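As a reading aid for the @foo check lines, here is a standalone sketch (plain C++, no LLVM; variable names are illustrative only) of what the vectorized form computes: the fsub/fadd alternate-shuffle bundle is evaluated as two full 4-wide operations whose results are blended lane by lane, even lanes taking the fsub result and odd lanes the fadd result, which matches what the scalar input IR computes per lane.

// Plain C++ model of the vectorized @foo: two whole-vector operations plus an
// even/odd blend, checked against a scalar reference written like the input IR.
#include <array>
#include <cassert>

int main() {
  const float b = 2.0f; // stand-in for the value loaded from @b

  // Per-lane left and right operands gathered for the fsub/fadd bundle.
  const std::array<float, 4> lhs = {-0.0f, b, -0.0f, b};
  const std::array<float, 4> rhs = {b * 0.0f, 0.0f, b, 1.0f};

  std::array<float, 4> sub, add, blended;
  for (int lane = 0; lane < 4; ++lane) {
    sub[lane] = lhs[lane] - rhs[lane];                        // vector fsub
    add[lane] = lhs[lane] + rhs[lane];                        // vector fadd
    blended[lane] = (lane % 2 == 0) ? sub[lane] : add[lane];  // even/odd blend
  }

  // Scalar reference, one entry per lane of the original code.
  const std::array<float, 4> scalar = {-0.0f - b * 0.0f, b + 0.0f,
                                       -0.0f - b, b + 1.0f};
  for (int lane = 0; lane < 4; ++lane)
    assert(blended[lane] == scalar[lane]);
  return 0;
}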