Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -596,12 +596,12 @@
 
   /// \reorder commutative operands in alt shuffle if they result in
   /// vectorized code.
-  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+  void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
                                  SmallVectorImpl<Value *> &Left,
                                  SmallVectorImpl<Value *> &Right);
   /// \reorder commutative operands to get better probability of
   /// generating vectorized code.
-  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+  void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,
                                       SmallVectorImpl<Value *> &Left,
                                       SmallVectorImpl<Value *> &Right);
 
   struct TreeEntry {
@@ -1302,6 +1302,38 @@
   }
 }
 
+static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) {
+  switch(Opcode) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return ConstantInt::getNullValue(Ty);
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+    return ConstantInt::get(Ty, /*V=*/1);
+  case Instruction::FAdd:
+  case Instruction::FSub:
+    return ConstantFP::get(Ty, /*V=*/0.0);
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+    return ConstantFP::get(Ty, /*V=*/1.0);
+  case Instruction::And:
+    return ConstantInt::getAllOnesValue(Ty);
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return ConstantInt::getNullValue(Type::getInt32Ty(Ty->getContext()));
+  default:
+    break;
+  }
+  llvm_unreachable("unknown binop for default constant value");
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             int UserTreeIdx) {
   bool isAltShuffle = false;
@@ -1635,7 +1667,7 @@
       // have the same opcode.
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right);
+        reorderInputsAccordingToOpcode(VL0->getOpcode(), VL, Left, Right);
        buildTree_rec(Left, Depth + 1, UserTreeIdx);
        buildTree_rec(Right, Depth + 1, UserTreeIdx);
        return;
@@ -1799,7 +1831,7 @@
      // Reorder operands if reordering would enable vectorization.
      if (isa<BinaryOperator>(VL0)) {
        ValueList Left, Right;
-        reorderAltShuffleOperands(VL, Left, Right);
+        reorderAltShuffleOperands(VL0->getOpcode(), VL, Left, Right);
        buildTree_rec(Left, Depth + 1, UserTreeIdx);
        buildTree_rec(Right, Depth + 1, UserTreeIdx);
        return;
@@ -2344,13 +2376,20 @@
 // load a[3] + load b[3]
 // Reordering the second load b[1] load a[1] would allow us to vectorize this
 // code.
-void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
+void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
                                         SmallVectorImpl<Value *> &Left,
                                         SmallVectorImpl<Value *> &Right) {
   // Push left and right operands of binary operation into Left and Right
-  for (Value *i : VL) {
-    Left.push_back(cast<Instruction>(i)->getOperand(0));
-    Right.push_back(cast<Instruction>(i)->getOperand(1));
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  for (Value *V : VL) {
+    auto *I = cast<Instruction>(V);
+    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
+      Left.push_back(I->getOperand(0));
+      Right.push_back(I->getOperand(1));
+    } else {
+      Left.push_back(I);
+      Right.push_back(getDefaultConstantForOpcode(Opcode, I->getType()));
+    }
   }
 
   // Reorder if we have a commutative operation and consecutive access
@@ -2395,14 +2434,17 @@
 // The vectorizer is trying to either have all elements one side being
 // instruction with the same opcode to enable further vectorization, or having
 // a splat to lower the vectorizing cost.
-static bool shouldReorderOperands(int i, Instruction &I,
-                                  SmallVectorImpl<Value *> &Left,
-                                  SmallVectorImpl<Value *> &Right,
-                                  bool AllSameOpcodeLeft,
-                                  bool AllSameOpcodeRight, bool SplatLeft,
-                                  bool SplatRight) {
-  Value *VLeft = I.getOperand(0);
-  Value *VRight = I.getOperand(1);
+static bool shouldReorderOperands(
+    int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
+    ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
+    bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
+  if (I.getOpcode() == Opcode) {
+    VLeft = I.getOperand(0);
+    VRight = I.getOperand(1);
+  } else {
+    VLeft = &I;
+    VRight = getDefaultConstantForOpcode(Opcode, I.getType());
+  }
   // If we have "SplatRight", try to see if commuting is needed to preserve it.
   if (SplatRight) {
     if (VRight == Right[i - 1])
@@ -2458,15 +2500,24 @@
   return false;
 }
 
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
+                                             ArrayRef<Value *> VL,
                                              SmallVectorImpl<Value *> &Left,
                                              SmallVectorImpl<Value *> &Right) {
   if (VL.size()) {
     // Peel the first iteration out of the loop since there's nothing
     // interesting to do anyway and it simplifies the checks in the loop.
-    auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
-    auto VRight = cast<Instruction>(VL[0])->getOperand(1);
+    auto *I = cast<Instruction>(VL[0]);
+    Value *VLeft;
+    Value *VRight;
+    if (I->getOpcode() == Opcode) {
+      VLeft = I->getOperand(0);
+      VRight = I->getOperand(1);
+    } else {
+      VLeft = I;
+      VRight = getDefaultConstantForOpcode(Opcode, I->getType());
+    }
     if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
      // Favor having instruction to the right. FIXME: why?
      std::swap(VLeft, VRight);
@@ -2483,16 +2534,21 @@
 
   for (unsigned i = 1, e = VL.size(); i != e; ++i) {
     Instruction *I = cast<Instruction>(VL[i]);
-    assert(I->isCommutative() && "Can only process commutative instruction");
+    assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
+            (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
+           "Can only process commutative instruction");
     // Commute to favor either a splat or maximizing having the same opcodes on
     // one side.
-    if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
-                              AllSameOpcodeRight, SplatLeft, SplatRight)) {
-      Left.push_back(I->getOperand(1));
-      Right.push_back(I->getOperand(0));
+    Value *VLeft;
+    Value *VRight;
+    if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
+                              AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
+                              VRight)) {
+      Left.push_back(VRight);
+      Right.push_back(VLeft);
     } else {
-      Left.push_back(I->getOperand(0));
-      Right.push_back(I->getOperand(1));
+      Left.push_back(VLeft);
+      Right.push_back(VRight);
     }
     // Update Splat* and AllSameOpcode* after the insertion.
     SplatRight = SplatRight && (Right[i - 1] == Right[i]);
@@ -2843,11 +2899,19 @@
     case Instruction::Xor: {
       ValueList LHSVL, RHSVL;
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
-        reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+        reorderInputsAccordingToOpcode(VL0->getOpcode(),
+                                       E->Scalars, LHSVL, RHSVL);
      else
        for (Value *V : E->Scalars) {
-          LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
-          RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
+          auto *I = cast<Instruction>(V);
+          if (I->getOpcode() == VL0->getOpcode()) {
+            LHSVL.push_back(I->getOperand(0));
+            RHSVL.push_back(I->getOperand(1));
+          } else {
+            LHSVL.push_back(V);
+            RHSVL.push_back(
+                getDefaultConstantForOpcode(VL0->getOpcode(), I->getType()));
+          }
        }
 
       setInsertPointAfterBundle(E->Scalars, VL0);
@@ -3011,7 +3075,7 @@
     case Instruction::ShuffleVector: {
       ValueList LHSVL, RHSVL;
       assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
-      reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
+      reorderAltShuffleOperands(VL0->getOpcode(), E->Scalars, LHSVL, RHSVL);
       setInsertPointAfterBundle(E->Scalars, VL0);
 
       Value *LHS = vectorizeTree(LHSVL);
Index: test/Transforms/SLPVectorizer/X86/reorder.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/reorder.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -slp-vectorizer -mcpu=skx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global float 0.000000e+00, align 4
+@c = common global [1 x i32] zeroinitializer, align 4
+@d = common global [1 x i32] zeroinitializer, align 4
+
+define i32 @fn1() local_unnamed_addr #0 {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i32 ()* @fn1 to <2 x i8>*), align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i8> <i8 52, i8 8>, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i8> [[TMP1]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw <2 x i32> [[TMP2]], <i32 3, i32 5>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (...) @fn2()
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %0 = load i8, i8* bitcast (i32 ()* @fn1 to i8*), align 1
+  %1 = load i8, i8* getelementptr (i8, i8* bitcast (i32 ()* @fn1 to i8*), i64 1), align 1
+  %2 = and i8 %0, 52
+  %and = zext i8 %2 to i32
+  %shl = shl nuw nsw i32 %and, 3
+  %3 = and i8 %1, 8
+  %and2 = zext i8 %3 to i32
+  %shl3 = shl nuw nsw i32 %and2, 5
+  %or = or i32 %shl3, %shl
+  %tobool = icmp eq i32 %or, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call = tail call i32 (...) @fn2() #2
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret i32 undef
+}
+
+declare i32 @fn2(...) #1
+
+define i32 @fn3() local_unnamed_addr #0 {
+; CHECK-LABEL: @fn3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* @b, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul float [[TMP0]], 0.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 0, i64 0), align 4
+; CHECK-NEXT:    [[CONV5:%.*]] = sitofp i32 [[TMP2]] to float
+; CHECK-NEXT:    [[MUL6:%.*]] = fmul float [[TMP0]], [[CONV5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 1, i64 0), align 4
+; CHECK-NEXT:    [[CONV5_1:%.*]] = sitofp i32 [[TMP3]] to float
+; CHECK-NEXT:    [[MUL6_1:%.*]] = fmul float [[TMP0]], [[CONV5_1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> <float -0.000000e+00, float undef, float undef, float undef>, float [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float -0.000000e+00, i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[MUL6]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[MUL6_1]], i32 3
+; CHECK-NEXT:    [[TMP11:%.*]] = fsub <4 x float> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x float> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = fptosi <4 x float> [[TMP13]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* bitcast ([1 x i32]* @d to <4 x i32>*), align 4
+; CHECK-NEXT:    ret i32 undef
+;
+entry:
+  %0 = load float, float* @b, align 4
+  %1 = fmul float %0, 0.000000e+00
+  %mul = fsub float -0.000000e+00, %1
+  %conv1 = fptosi float %mul to i32
+  store i32 %conv1, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @d, i64 0, i64 0), align 4
+  %2 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 0, i64 0), align 4
+  %conv5 = sitofp i32 %2 to float
+  %mul6 = fmul float %0, %conv5
+  %add = fadd float %0, %mul6
+  %conv7 = fptosi float %add to i32
+  store i32 %conv7, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @d, i64 1, i64 0), align 4
+  %mul.1 = fsub float -0.000000e+00, %0
+  %conv1.1 = fptosi float %mul.1 to i32
+  store i32 %conv1.1, i32* getelementptr ([1 x i32], [1 x i32]* @d, i64 2, i64 0), align 4
+  %3 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @c, i64 1, i64 0), align 4
+  %conv5.1 = sitofp i32 %3 to float
+  %mul6.1 = fmul float %0, %conv5.1
+  %add.1 = fadd float %0, %mul6.1
+  %conv7.1 = fptosi float %add.1 to i32
+  store i32 %conv7.1, i32* getelementptr ([1 x i32], [1 x i32]* @d, i64 3, i64 0), align 4
+  ret i32 undef
+}
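
A minimal sketch of the idea behind getDefaultConstantForOpcode, for illustration only and not part of the patch (the names %a0, %a1, %b0, %b1, %x and %p are hypothetical): when one lane of a bundle does not carry the bundle's opcode, that lane's scalar is kept as the left operand and the opcode's identity constant becomes the right operand, so Left/Right operand vectors can still be formed for every lane.

  %s0 = add i32 %a0, %b0    ; lane 0: Left = %a0, Right = %b0
  %s1 = add i32 %a1, %b1    ; lane 1: Left = %a1, Right = %b1
  %x  = load i32, i32* %p   ; lane 2 is not an Add: Left = %x, Right = 0
                            ; (the Add identity), i.e. the lane is treated as "%x + 0"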