Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -95,6 +95,7 @@ bool foldExtractedCmps(Instruction &I); bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); + bool foldShuffleOfBinops(Instruction &I); void replaceValue(Value &Old, Value &New) { Old.replaceAllUsesWith(&New); @@ -1058,6 +1059,60 @@ return true; } +/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into +/// "binop (shuffle), (shuffle)". +bool VectorCombine::foldShuffleOfBinops(Instruction &I) { + auto *VecTy = dyn_cast(I.getType()); + if (!VecTy) + return false; + + BinaryOperator *B0, *B1; + ArrayRef Mask; + if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)), + m_Mask(Mask))) || + B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy) + return false; + + // Try to replace a binop with a shuffle if the shuffle is not costly. + // The new shuffle will choose from a single, common operand, so it may be + // cheaper than the existing two-operand shuffle. + SmallVector UnaryMask = createUnaryMask(Mask, Mask.size()); + Instruction::BinaryOps Opcode = B0->getOpcode(); + InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy); + InstructionCost ShufCost = TTI.getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask); + if (ShufCost > BinopCost) + return false; + + // If we have something like "add X, Y" and "add Z, X", swap ops to match. + Value *X = B0->getOperand(0), *Y = B0->getOperand(1); + Value *Z = B1->getOperand(0), *W = B1->getOperand(1); + if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W) + std::swap(X, Y); + + Value *Shuf0, *Shuf1; + if (X == Z) { + // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W) + Shuf0 = Builder.CreateShuffleVector(X, UnaryMask); + Shuf1 = Builder.CreateShuffleVector(Y, W, Mask); + } else if (Y == W) { + // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y) + Shuf0 = Builder.CreateShuffleVector(X, Z, Mask); + Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask); + } else { + return false; + } + + Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1); + // Intersect flags from the old binops. + if (auto *NewInst = dyn_cast(NewBO)) { + NewInst->copyIRFlags(B0); + NewInst->andIRFlags(B1); + } + replaceValue(I, *NewBO); + return true; +} + /// This is the entry point for all transforms. Pass manager differences are /// handled in the callers of this function. bool VectorCombine::run() { @@ -1078,6 +1133,7 @@ MadeChange |= foldExtractedCmps(I); MadeChange |= scalarizeLoadExtract(I); MadeChange |= foldSingleElementStore(I); + MadeChange |= foldShuffleOfBinops(I); }; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. Index: llvm/test/Transforms/VectorCombine/X86/shuffle.ll =================================================================== --- llvm/test/Transforms/VectorCombine/X86/shuffle.ll +++ llvm/test/Transforms/VectorCombine/X86/shuffle.ll @@ -151,11 +151,13 @@ ret <2 x i64> %bc3 } +; Shuffle is much cheaper than fdiv. FMF are intersected. + define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x float> %z) { ; CHECK-LABEL: @shuf_fdiv_v4f32_yy( -; CHECK-NEXT: [[B0:%.*]] = fdiv fast <4 x float> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = fdiv arcp <4 x float> [[Z:%.*]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> [[Z:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[R:%.*]] = fdiv arcp <4 x float> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <4 x float> [[R]] ; %b0 = fdiv fast <4 x float> %x, %y @@ -164,11 +166,13 @@ ret <4 x float> %r } +; Common operand is op0 of the binops. + define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: @shuf_add_v4i32_xx( -; CHECK-NEXT: [[B0:%.*]] = add <4 x i32> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = add <4 x i32> [[X]], [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> +; CHECK-NEXT: [[R:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <4 x i32> [[R]] ; %b0 = add <4 x i32> %x, %y @@ -177,11 +181,13 @@ ret <4 x i32> %r } +; For commutative instructions, common operand may be swapped. + define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) { ; CHECK-LABEL: @shuf_fmul_v4f32_xx_swap( -; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z:%.*]], [[X]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <4 x float> [[R]] ; %b0 = fmul <4 x float> %x, %y @@ -190,11 +196,13 @@ ret <4 x float> %r } +; For commutative instructions, common operand may be swapped. + define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { ; CHECK-LABEL: @shuf_and_v2i64_yy_swap( -; CHECK-NEXT: [[B0:%.*]] = and <2 x i64> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = and <2 x i64> [[Y]], [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[B0]], <2 x i64> [[B1]], <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[Y:%.*]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X:%.*]], <2 x i64> [[Z:%.*]], <2 x i32> +; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <2 x i64> [[R]] ; %b0 = and <2 x i64> %x, %y @@ -203,11 +211,13 @@ ret <2 x i64> %r } +; non-commutative binop, but common op0 + define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: @shuf_shl_v4i32_xx( -; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[X]], [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> +; CHECK-NEXT: [[R:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <4 x i32> [[R]] ; %b0 = shl <4 x i32> %x, %y @@ -216,6 +226,8 @@ ret <4 x i32> %r } +; negative test - common operand, but not commutable + define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: @shuf_shl_v4i32_xx_swap( ; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] @@ -229,6 +241,8 @@ ret <4 x i32> %r } +; negative test - mismatched opcodes + define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { ; CHECK-LABEL: @shuf_sub_add_v2i64_yy( ; CHECK-NEXT: [[B0:%.*]] = sub <2 x i64> [[X:%.*]], [[Y:%.*]] @@ -242,6 +256,8 @@ ret <2 x i64> %r } +; negative test - type change via shuffle + define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) { ; CHECK-LABEL: @shuf_fmul_v4f32_xx_type( ; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X:%.*]], [[Y:%.*]] @@ -255,6 +271,8 @@ ret <8 x float> %r } +; negative test - uses + define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: @shuf_lshr_v4i32_yy_use1( ; CHECK-NEXT: [[B0:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] @@ -270,6 +288,8 @@ ret <4 x i32> %r } +; negative test - uses + define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: @shuf_mul_v4i32_yy_use2( ; CHECK-NEXT: [[B0:%.*]] = mul <4 x i32> [[X:%.*]], [[Y:%.*]] @@ -285,6 +305,8 @@ ret <4 x i32> %r } +; negative test - must have matching operand + define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) { ; CHECK-LABEL: @shuf_fadd_v4f32_no_common_op( ; CHECK-NEXT: [[B0:%.*]] = fadd <4 x float> [[X:%.*]], [[Y:%.*]] @@ -298,6 +320,8 @@ ret <4 x float> %r } +; negative test - binops may be relatively cheap + define <16 x i16> @shuf_and_v16i16_yy_expensive_shuf(<16 x i16> %x, <16 x i16> %y, <16 x i16> %z) { ; CHECK-LABEL: @shuf_and_v16i16_yy_expensive_shuf( ; CHECK-NEXT: [[B0:%.*]] = and <16 x i16> [[X:%.*]], [[Y:%.*]]