Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1941,7 +1941,6 @@
   }
 }
-
 // Return true if I should be commuted before adding it's left and right
 // operands to the arrays Left and Right.
 //
@@ -1950,60 +1949,76 @@
 // a splat to lower the vectorizing cost.
 static bool shouldReorderOperands(int i, Instruction &I,
                                   SmallVectorImpl<Value *> &Left,
-                                  SmallVectorImpl<Value *> &Right) {
+                                  SmallVectorImpl<Value *> &Right,
+                                  bool AllSameOpcodeLeft,
+                                  bool AllSameOpcodeRight, bool SplatLeft,
+                                  bool SplatRight) {
   Value *VLeft = I.getOperand(0);
   Value *VRight = I.getOperand(1);
+  // If we have "SplatRight", try to see if commuting is needed to preserve it.
+  if (SplatRight) {
+    if (VRight == Right[i - 1])
+      // Preserve SplatRight
+      return false;
+    if (VLeft == Right[i - 1]) {
+      // Commuting would preserve SplatRight, but we don't want to break
+      // SplatLeft either, i.e. preserve the original order if possible.
+      // (FIXME: why do we care?)
+      if (SplatLeft && VLeft == Left[i - 1])
+        return false;
+      return true;
+    }
+  }
+  // Symmetrically handle the Left side.
+  if (SplatLeft) {
+    if (VLeft == Left[i - 1])
+      // Preserve SplatLeft
+      return false;
+    if (VRight == Left[i - 1])
+      return true;
+  }
+
   Instruction *ILeft = dyn_cast<Instruction>(VLeft);
   Instruction *IRight = dyn_cast<Instruction>(VRight);
 
-  // Sort two opcodes. In the code below we try to preserve the ability to use
-  // broadcast of values instead of individual inserts.
-  // vl1 = load
-  // vl2 = phi
-  // vr1 = load
-  // vr2 = vr2
-  //     = vl1 x vr1
-  //     = vl2 x vr2
-  // If we just sorted according to opcode we would leave the first line in
-  // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
-  //    = vl1 x vr1
-  //    = vr2 x vl2
-  // Because vr2 and vr1 are from the same load we loose the opportunity of a
-  // broadcast for the packed right side in the backend: we have [vr1, vl2]
-  // instead of [vr1, vr2=vr1].
-  if (ILeft && IRight) {
-    if (ILeft->getOpcode() > IRight->getOpcode() &&
-        Right[i - 1] != IRight) {
-      // Try not to destroy a broad cast for no apparent benefit.
-      return true;
-    } else if (ILeft->getOpcode() == IRight->getOpcode() &&
-               Right[i - 1] == ILeft) {
-      // Try preserve broadcasts.
-      return true;
-    } else if (ILeft->getOpcode() == IRight->getOpcode() &&
-               Left[i - 1] == IRight) {
-      // Try preserve broadcasts.
+  // If we have "AllSameOpcodeRight", check whether the left operand preserves
+  // it and the right one does not; in that case we want to commute.
+  if (AllSameOpcodeRight) {
+    unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
+    if (IRight && RightPrevOpcode == IRight->getOpcode())
+      // Do not commute; a match on the right preserves AllSameOpcodeRight.
+      return false;
+    if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
+      // We have a match and may want to commute, but first check if there is
+      // not also a match on the existing operands on the Left to preserve
+      // AllSameOpcodeLeft, i.e. preserve the original order if possible.
+      // (FIXME: why do we care?)
+      if (AllSameOpcodeLeft && ILeft &&
+          cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
+        return false;
      return true;
    }
-    return false;
  }
-  // One opcode, put the instruction on the right.
-  return ILeft != nullptr;
+  // Symmetrically handle the Left side.
+  if (AllSameOpcodeLeft) {
+    unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
+    if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
+      return false;
+    if (IRight && LeftPrevOpcode == IRight->getOpcode())
+      return true;
+  }
+  return false;
 }
 
 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                              SmallVectorImpl<Value *> &Left,
                                              SmallVectorImpl<Value *> &Right) {
-  SmallVector<Value *, 16> OrigLeft, OrigRight;
-
   if (VL.size()) {
     // Peel the first iteration out of the loop since there's nothing
-    // interesting to do anyway and it simplifies the checks
+    // interesting to do anyway and it simplifies the checks in the loop.
     auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
     auto VRight = cast<Instruction>(VL[0])->getOperand(1);
-    OrigLeft.push_back(VLeft);
-    OrigRight.push_back(VRight);
     if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
       // Favor having instruction to the right. FIXME: why?
      std::swap(VLeft, VRight);
@@ -2014,60 +2029,38 @@
   // Keep track if we have instructions with all the same opcode on one side.
   bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
   bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
+  // Keep track if we have one side with all the same value (broadcast).
+  bool SplatLeft = true;
+  bool SplatRight = true;
 
   for (unsigned i = 1, e = VL.size(); i != e; ++i) {
     Instruction *I = cast<Instruction>(VL[i]);
-
-    Value *VLeft = I->getOperand(0);
-    Value *VRight = I->getOperand(1);
-    OrigLeft.push_back(VLeft);
-    OrigRight.push_back(VRight);
-    Instruction *ILeft = dyn_cast<Instruction>(VLeft);
-    Instruction *IRight = dyn_cast<Instruction>(VRight);
-
-    // Check whether all operands on one side have the same opcode. In this case
-    // we want to preserve the original order and not make things worse by
-    // reordering.
-    if (AllSameOpcodeLeft && ILeft) {
-      if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
-        if (PLeft->getOpcode() != ILeft->getOpcode())
-          AllSameOpcodeLeft = false;
-      } else
-        AllSameOpcodeLeft = false;
-    }
-    if (AllSameOpcodeRight && IRight) {
-      if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
-        if (PRight->getOpcode() != IRight->getOpcode())
-          AllSameOpcodeRight = false;
-      } else
-        AllSameOpcodeRight = false;
-    }
-
-
+    assert(I->isCommutative() && "Can only process commutative instruction");
     // Commute to favor either a splat or maximizing having the same opcodes on
     // one side.
-    if (shouldReorderOperands(i, *I, Left, Right)) {
+    if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
+                              AllSameOpcodeRight, SplatLeft, SplatRight)) {
       Left.push_back(I->getOperand(1));
       Right.push_back(I->getOperand(0));
     } else {
       Left.push_back(I->getOperand(0));
       Right.push_back(I->getOperand(1));
     }
+    // Update Splat* and AllSameOpcode* after the insertion.
+    SplatRight = SplatRight && (Right[i - 1] == Right[i]);
+    SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
+    AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
+                        (cast<Instruction>(Left[i - 1])->getOpcode() ==
+                         cast<Instruction>(Left[i])->getOpcode());
+    AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
+                         (cast<Instruction>(Right[i - 1])->getOpcode() ==
+                          cast<Instruction>(Right[i])->getOpcode());
   }
 
-  bool LeftBroadcast = isSplat(Left);
-  bool RightBroadcast = isSplat(Right);
-
-  // If operands end up being broadcast return this operand order.
-  if (LeftBroadcast || RightBroadcast)
+  // If one operand ends up being a broadcast, return this operand order.
+  if (SplatRight || SplatLeft)
     return;
 
-  // Don't reorder if the operands where good to begin.
-  if (AllSameOpcodeRight || AllSameOpcodeLeft) {
-    Left = OrigLeft;
-    Right = OrigRight;
-  }
-
   const DataLayout &DL = F->getParent()->getDataLayout();
 
   // Finally check if we can get longer vectorizable chain by reordering
Index: test/Transforms/SLPVectorizer/X86/commutativity.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -0,0 +1,78 @@
+; RUN: opt -slp-vectorizer < %s -S | FileCheck %s
+
+; Verify that the SLP vectorizer is able to figure out that commutativity
+; offers the possibility to splat/broadcast %c and thus makes it profitable
+; to vectorize this case.
+
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+@cle = external unnamed_addr global [32 x i8], align 16
+@cle32 = external unnamed_addr global [32 x i32], align 16
+
+
+; Check that we correctly detect a splat/broadcast by leveraging the
+; commutativity property of `xor`.
+
+; CHECK-LABEL: @splat
+; CHECK: store <16 x i8>
+define void @splat(i8 %a, i8 %b, i8 %c) {
+  %1 = xor i8 %c, %a
+  store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
+  %2 = xor i8 %a, %c
+  store i8 %2, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
+  %3 = xor i8 %a, %c
+  store i8 %3, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
+  %4 = xor i8 %a, %c
+  store i8 %4, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
+  %5 = xor i8 %c, %a
+  store i8 %5, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
+  %6 = xor i8 %c, %b
+  store i8 %6, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
+  %7 = xor i8 %c, %a
+  store i8 %7, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
+  %8 = xor i8 %c, %b
+  store i8 %8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
+  %9 = xor i8 %a, %c
+  store i8 %9, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
+  %10 = xor i8 %a, %c
+  store i8 %10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
+  %11 = xor i8 %a, %c
+  store i8 %11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
+  %12 = xor i8 %a, %c
+  store i8 %12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
+  %13 = xor i8 %a, %c
+  store i8 %13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
+  %14 = xor i8 %a, %c
+  store i8 %14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
+  %15 = xor i8 %a, %c
+  store i8 %15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
+  %16 = xor i8 %a, %c
+  store i8 %16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
+  ret void
+}
+
+
+
+; Check that we correctly detect that we can have the same opcode on one side
+; by leveraging the commutativity property of `xor`.
+
+; CHECK-LABEL: @same_opcode_on_one_side
+; CHECK: store <4 x i32>
+define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
+  %add1 = add i32 %c, %a
+  %add2 = add i32 %c, %a
+  %add3 = add i32 %a, %c
+  %add4 = add i32 %c, %a
+  %1 = xor i32 %add1, %a
+  store i32 %1, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
+  %2 = xor i32 %b, %add2
+  store i32 %2, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
+  %3 = xor i32 %c, %add3
+  store i32 %3, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
+  %4 = xor i32 %a, %add4
+  store i32 %4, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
+  ret void
+}