Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -30,6 +31,7 @@
 #define DEBUG_TYPE "vector-combine"
 
 STATISTIC(NumVecCmp, "Number of vector compares formed");
+STATISTIC(NumVecBO, "Number of vector binops formed");
 
 static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) {
   // Match a cmp with extracted vector operands.
@@ -76,6 +78,65 @@
   return true;
 }
 
+/// Try to reduce extract element costs by converting scalar binops to vector
+/// binops followed by extract.
+static bool foldExtractBinop(Instruction &I, const TargetTransformInfo &TTI) {
+  // It is not safe to transform things like div, urem, etc. because we may
+  // create undefined behavior when executing those on unknown vector elements.
+  if (!isSafeToSpeculativelyExecute(&I))
+    return false;
+
+  // Match a scalar binop with extracted vector operands:
+  // bo (extelt X, C0), (extelt Y, C1)
+  Instruction *Ext0, *Ext1;
+  if (!match(&I, m_BinOp(m_Instruction(Ext0), m_Instruction(Ext1))))
+    return false;
+
+  // TODO: Relax the one-use constraints by adjusting the cost calc.
+  Value *X, *Y;
+  uint64_t C0, C1;
+  if (!match(Ext0, m_OneUse(m_ExtractElement(m_Value(X), m_ConstantInt(C0)))) ||
+      !match(Ext1, m_OneUse(m_ExtractElement(m_Value(Y), m_ConstantInt(C1)))))
+    return false;
+
+  // Check if using a vector binop would be cheaper.
+  Instruction::BinaryOps BOpcode = cast<BinaryOperator>(I).getOpcode();
+  Type *ScalarTy = I.getType();
+  Type *VecTy = X->getType();
+  int ScalarBOCost = TTI.getArithmeticInstrCost(BOpcode, ScalarTy);
+  int VecBOCost = TTI.getArithmeticInstrCost(BOpcode, VecTy);
+  int Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                            VecTy, C0);
+  int Extract1Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                            VecTy, C1);
+
+  // Handle a special case - if the extract indexes are the same, the
+  // replacement sequence does not require a shuffle. Unless the vector binop is
+  // much more expensive than the scalar binop, this eliminates an extract.
+  if (C0 == C1) {
+    int ScalarCost = Extract0Cost + Extract1Cost + ScalarBOCost;
+    int VecCost = VecBOCost + Extract0Cost;
+    if (ScalarCost <= VecCost)
+      return false;
+
+    // bo (extelt X, C), (extelt Y, C) --> extelt (bo X, Y), C
+    ++NumVecBO;
+    IRBuilder<> Builder(&I);
+    Value *NewBO = Builder.CreateBinOp(BOpcode, X, Y);
+    if (auto *VecBOInst = dyn_cast<Instruction>(NewBO)) {
+      // All IR flags are safe to back-propagate because any potential poison
+      // created in unused vector elements is discarded by the extract.
+      VecBOInst->copyIRFlags(&I);
+    }
+    Value *Extract = Builder.CreateExtractElement(NewBO, Ext0->getOperand(1));
+    I.replaceAllUsesWith(Extract);
+    return true;
+  }
+
+  // TODO: Handle C0 != C1 by shuffling 1 of the operands.
+  return false;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
@@ -92,7 +153,7 @@
     // iteratively in this loop rather than waiting until the end.
     for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
       MadeChange |= foldExtractCmp(I, TTI);
-      // TODO: More transforms go here.
+      MadeChange |= foldExtractBinop(I, TTI);
     }
   }
Index: llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- | FileCheck %s
 
+; Eliminating extract is profitable.
+
 define i8 @ext0_ext0_add(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext0_ext0_add(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = add i8 [[E0]], [[E1]]
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i8> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 0
   %e1 = extractelement <16 x i8> %y, i32 0
@@ -14,12 +15,13 @@
   ret i8 %r
 }
 
+; Eliminating extract is still profitable. Flags propagate.
+
 define i8 @ext1_ext1_add_flags(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext1_ext1_add_flags(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i8 [[E0]], [[E1]]
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw <16 x i8> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 1
   %e1 = extractelement <16 x i8> %y, i32 1
@@ -27,6 +29,8 @@
   ret i8 %r
 }
 
+; Negative test - eliminating extract is profitable, but vector shift is expensive.
+
 define i8 @ext1_ext1_shl(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext1_ext1_shl(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
@@ -40,6 +44,8 @@
   ret i8 %r
 }
 
+; Negative test - eliminating extract is profitable, but vector multiply is expensive.
+
 define i8 @ext13_ext13_mul(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext13_ext13_mul(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 13
@@ -53,6 +59,8 @@
   ret i8 %r
 }
 
+; Negative test - cost is irrelevant because sdiv has potential UB.
+
 define i8 @ext0_ext0_sdiv(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext0_ext0_sdiv(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
@@ -66,6 +74,8 @@
   ret i8 %r
 }
 
+; Negative test - extracts are free and vector op has same cost as scalar.
+
 define double @ext0_ext0_fadd(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @ext0_ext0_fadd(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
@@ -79,12 +89,13 @@
   ret double %r
 }
 
+; Eliminating extract is profitable. Flags propagate.
+
 define double @ext1_ext1_fsub(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @ext1_ext1_fsub(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fsub fast double [[E0]], [[E1]]
-; CHECK-NEXT:    ret double [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub fast <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    ret double [[TMP2]]
 ;
   %e0 = extractelement <2 x double> %x, i32 1
   %e1 = extractelement <2 x double> %y, i32 1
@@ -92,6 +103,8 @@
   ret double %r
 }
 
+; TODO: Handling different extract indexes requires a shuffle.
+
 define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: @ext0_ext1_add(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
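Note on the remaining TODO (different extract indexes, as in @ext0_ext1_add above): a minimal IR sketch of what a follow-up fold could produce, assuming the cost model finds a single shuffle of one operand cheap enough on the target. The function name, <4 x i32> type, and shuffle mask below are made up for illustration; this IR is not produced by this patch.

  define i32 @ext0_ext1_add_sketch(<4 x i32> %x, <4 x i32> %y) {
    ; Move element 1 of %y into lane 0 so both operands use the same lane,
    ; then do the binop on vectors and extract the single result lane.
    %s = shufflevector <4 x i32> %y, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %b = add <4 x i32> %x, %s
    %r = extractelement <4 x i32> %b, i32 0
    ret i32 %r
  }

This computes x[0] + y[1] like the scalar form, while still ending in a single extract, so the same cost comparison as the C0 == C1 case would apply plus the shuffle cost.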