diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" @@ -44,6 +45,7 @@ STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast"); STATISTIC(NumScalarBO, "Number of scalar binops formed"); STATISTIC(NumScalarCmp, "Number of scalar compares formed"); +STATISTIC(NumVecGEPsScalarized, "Number of vector GEP's that were scalarized"); static cl::opt DisableVectorCombine( "disable-vector-combine", cl::init(false), cl::Hidden, @@ -95,6 +97,7 @@ bool foldExtractedCmps(Instruction &I); bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); + bool scalarizeGEP(Instruction &I); void replaceValue(Value &Old, Value &New) { Old.replaceAllUsesWith(&New); @@ -1058,6 +1061,141 @@ return true; } +/// Try to scalarize vector GEP only uses of which are `extractelement`'s. 
+///
+/// Every user of the vector GEP must be an `extractelement` with a constant,
+/// in-range lane index. For each demanded lane a scalar GEP is built from that
+/// lane's scalar base pointer and scalar index (located with
+/// findScalarElement() when possible, otherwise extracted from the vector
+/// operand), and the lane's extracts are replaced with it. The new
+/// instructions are created speculatively and erased again unless the TTI cost
+/// of the scalar form does not exceed the cost of the vector GEP plus its
+/// extracts.
+///
+/// \param I the instruction to inspect; anything but a single-index GEP that
+///          produces a fixed vector of pointers is rejected.
+/// \returns true if the IR was changed.
+bool VectorCombine::scalarizeGEP(Instruction &I) {
+  auto *GEP = dyn_cast<GetElementPtrInst>(&I);
+  if (!GEP)
+    return false;
+
+  auto *VecPtrTy = dyn_cast<FixedVectorType>(I.getType());
+  if (!VecPtrTy)
+    return false;
+  unsigned NumElts = VecPtrTy->getNumElements();
+  Type *PointeeTy = GEP->getSourceElementType();
+
+  if (GEP->getNumIndices() != 1)
+    return false; // FIXME
+
+  Value *GEPPointerOperand = GEP->getPointerOperand();
+  Value *GEPIndice = *GEP->idx_begin();
+
+  // Inline capacity for the small containers below.
+  // NOTE(review): 512 / 32 presumably is the lane count of a 512-bit vector
+  // of 32-bit elements - confirm.
+  constexpr const int SSONumLanes = 512 / 32;
+  // Maps a (scalar base pointer, scalar index) pair to the scalar GEP built
+  // for it, so that lanes with identical operands share a single GEP.
+  SmallDenseMap<std::pair<Value *, Value *>, Value *, SSONumLanes>
+      ScalarGEPCache;
+  struct ElementDetails {
+    // The `extractelement`'s that read this lane.
+    SmallVector<ExtractElementInst *, 1> Uses;
+    // Scalar operands of this lane's GEP; unset until they are known.
+    Optional<Value *> BasePtr;
+    Optional<Value *> Index;
+  };
+
+  // Analyze uses of this GEP, they all should be `extractelement`'s,
+  // group them by the element index of the computed vector of addresses.
+  // FIXME: deal with non-constant indices?
+  SmallDenseMap<unsigned, ElementDetails, SSONumLanes> GEPUses;
+  APInt GEPDemandedElts(NumElts, 0);
+  for (User *U : GEP->users()) {
+    auto *EEI = dyn_cast<ExtractElementInst>(U);
+    if (!EEI)
+      return false;
+    auto *IndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand());
+    if (!IndexC)
+      return false; // FIXME
+    // An out-of-range lane is still valid IR, but it must not reach
+    // APInt::setBit() below, and getZExtValue() requires the constant to fit
+    // into 64 bits. Compare on the APInt itself before narrowing.
+    if (IndexC->getValue().uge(NumElts))
+      return false;
+    unsigned Lane = IndexC->getZExtValue();
+    GEPUses[Lane].Uses.emplace_back(EEI);
+    GEPDemandedElts.setBit(Lane);
+  }
+
+  // Without any extract there is nothing to rewrite; do not report a change
+  // (or bump the statistic) for a GEP that has no users.
+  if (GEPUses.empty())
+    return false;
+
+  // Cost of what is there now: one vector GEP plus extracting every demanded
+  // lane of its result.
+  InstructionCost OriginalCost =
+      TTI.getGEPCost(PointeeTy, GEPPointerOperand, GEPIndice) +
+      TTI.getScalarizationOverhead(VecPtrTy, GEPDemandedElts, /*Insert=*/false,
+                                   /*Extract=*/true);
+
+  // For every demanded lane, try to find already-scalar operands. Lanes for
+  // which that fails are recorded so that the required extracts can be costed.
+  APInt GEPPointerOperandDemandedElts(NumElts, 0);
+  APInt GEPIndiceDemandedElts(NumElts, 0);
+  for (auto &GEPVectorElement : GEPUses) {
+    unsigned IndexC = GEPVectorElement.first;
+
+    // A scalar operand applies to every lane as-is.
+    auto FindScalarElement = [IndexC](Value *V) {
+      if (!V->getType()->isVectorTy())
+        return V;
+      return findScalarElement(V, IndexC);
+    };
+
+    ElementDetails &Dsc = GEPVectorElement.second;
+
+    if (Value *ScalarBasePtr = FindScalarElement(GEPPointerOperand))
+      Dsc.BasePtr = ScalarBasePtr;
+    else
+      GEPPointerOperandDemandedElts.setBit(IndexC);
+
+    if (Value *ScalarIndex = FindScalarElement(GEPIndice))
+      Dsc.Index = ScalarIndex;
+    else
+      GEPIndiceDemandedElts.setBit(IndexC);
+  }
+
+  InstructionCost ScalarizedCost = 0;
+
+  if (!GEPPointerOperandDemandedElts.isNullValue())
+    ScalarizedCost += TTI.getScalarizationOverhead(
+        cast<FixedVectorType>(GEPPointerOperand->getType()),
+        GEPPointerOperandDemandedElts, /*Insert=*/false,
+        /*Extract=*/true);
+  if (!GEPIndiceDemandedElts.isNullValue())
+    ScalarizedCost += TTI.getScalarizationOverhead(
+        cast<FixedVectorType>(GEPIndice->getType()), GEPIndiceDemandedElts,
+        /*Insert=*/false,
+        /*Extract=*/true);
+
+  // Everything created below is speculative: unless the list is cleared
+  // (i.e. the rewrite is accepted), the new instructions are erased again, in
+  // reverse creation order so that users go before their operands.
+  SmallVector<Instruction *, 16> NewInstructions;
+  auto EraseSpeculativeInsts = make_scope_exit([&]() {
+    for (Instruction *NewI : reverse(NewInstructions))
+      NewI->eraseFromParent();
+  });
+
+  // A private builder that records every instruction it inserts.
+  IRBuilder<ConstantFolder, IRBuilderCallbackInserter> VindictiveBuilder(
+      I.getContext(), ConstantFolder(),
+      IRBuilderCallbackInserter(
+          [&](Instruction *NewI) { NewInstructions.push_back(NewI); }));
+  // NOTE(review): the member Builder is not used below; clearing its
+  // insertion point presumably guards against accidental use - confirm.
+  Builder.ClearInsertionPoint();
+  VindictiveBuilder.SetInsertPoint(&I);
+
+  for (auto &GEPVectorElement : GEPUses) {
+    unsigned IndexC = GEPVectorElement.first;
+    ElementDetails &Dsc = GEPVectorElement.second;
+
+    if (!Dsc.BasePtr)
+      Dsc.BasePtr =
+          VindictiveBuilder.CreateExtractElement(GEPPointerOperand, IndexC);
+
+    if (!Dsc.Index)
+      Dsc.Index = VindictiveBuilder.CreateExtractElement(GEPIndice, IndexC);
+
+    // Lanes with identical scalar operands reuse one GEP and are costed once.
+    Value *&ScalarGEP = ScalarGEPCache[{*Dsc.BasePtr, *Dsc.Index}];
+    if (ScalarGEP)
+      continue;
+    ScalarizedCost += TTI.getGEPCost(PointeeTy, *Dsc.BasePtr, *Dsc.Index);
+    ScalarGEP =
+        VindictiveBuilder.CreateGEP(PointeeTy, *Dsc.BasePtr, *Dsc.Index);
+    // The builder may hand back something other than a GEP instruction
+    // (hence the dyn_cast); only an instruction can carry the flag.
+    if (auto *ScalarGEPInst = dyn_cast<GetElementPtrInst>(ScalarGEP))
+      ScalarGEPInst->setIsInBounds(GEP->isInBounds());
+  }
+
+  // Prefer to scalarize if cost is equal.
+  if (OriginalCost < ScalarizedCost)
+    return false;
+  // Accepted: keep the new instructions.
+  NewInstructions.clear();
+
+  for (ElementDetails &Dsc : make_second_range(GEPUses)) {
+    Value &ScalarGEP = *ScalarGEPCache[{*Dsc.BasePtr, *Dsc.Index}];
+    for (auto *GEPUser : Dsc.Uses)
+      replaceValue(*GEPUser, ScalarGEP);
+  }
+
+  ++NumVecGEPsScalarized;
+  return true;
+}
+
 /// This is the entry point for all transforms.
Pass manager differences are /// handled in the callers of this function. bool VectorCombine::run() { @@ -1078,6 +1216,7 @@ MadeChange |= foldExtractedCmps(I); MadeChange |= scalarizeLoadExtract(I); MadeChange |= foldSingleElementStore(I); + MadeChange |= scalarizeGEP(I); }; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll --- a/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll +++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll @@ -87,10 +87,11 @@ define void @indicies_need_extraction.2elts(i64* %baseptr, <2 x i64> %indices) { ; CHECK-LABEL: @indicies_need_extraction.2elts( -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR:%.*]], <2 x i64> [[INDICES:%.*]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <2 x i64*> [[PTRS]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[INDICES:%.*]], i64 0 +; CHECK-NEXT: [[PTR_0:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[INDICES]], i64 1 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR]], i64 [[TMP2]] ; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <2 x i64*> [[PTRS]], i64 1 ; CHECK-NEXT: call void @use(i64* [[PTR_1]]) ; CHECK-NEXT: ret void ; @@ -107,12 +108,14 @@ define void @indicies_need_extraction.3elts(i64* %baseptr, <3 x i64> %indices) { ; CHECK-LABEL: @indicies_need_extraction.3elts( -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR:%.*]], <3 x i64> [[INDICES:%.*]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[INDICES:%.*]], i64 0 +; CHECK-NEXT: [[PTR_0:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] 
= extractelement <3 x i64> [[INDICES]], i64 1 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[INDICES]], i64 2 +; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR]], i64 [[TMP3]] ; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 1 ; CHECK-NEXT: call void @use(i64* [[PTR_1]]) -; CHECK-NEXT: [[PTR_2:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 2 ; CHECK-NEXT: call void @use(i64* [[PTR_2]]) ; CHECK-NEXT: ret void ; @@ -132,14 +135,17 @@ define void @indicies_need_extraction.4elts(i64* %baseptr, <4 x i64> %indices) { ; CHECK-LABEL: @indicies_need_extraction.4elts( -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR:%.*]], <4 x i64> [[INDICES:%.*]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <4 x i64*> [[PTRS]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[INDICES:%.*]], i64 0 +; CHECK-NEXT: [[PTR_0:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[INDICES]], i64 1 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[INDICES]], i64 2 +; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[INDICES]], i64 3 +; CHECK-NEXT: [[PTR_3:%.*]] = getelementptr inbounds i64, i64* [[BASEPTR]], i64 [[TMP4]] ; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <4 x i64*> [[PTRS]], i64 1 ; CHECK-NEXT: call void @use(i64* [[PTR_1]]) -; CHECK-NEXT: [[PTR_2:%.*]] = extractelement <4 x i64*> [[PTRS]], i64 2 ; CHECK-NEXT: call void @use(i64* [[PTR_2]]) -; CHECK-NEXT: [[PTR_3:%.*]] = extractelement <4 x i64*> [[PTRS]], i64 3 ; CHECK-NEXT: call void @use(i64* [[PTR_3]]) 
; CHECK-NEXT: ret void ; @@ -164,10 +170,11 @@ define void @baseptrs_need_extraction.2elts(<2 x i64*> %baseptrs, i64 %indice) { ; CHECK-LABEL: @baseptrs_need_extraction.2elts( -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <2 x i64*> [[BASEPTRS:%.*]], i64 [[INDICE:%.*]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <2 x i64*> [[PTRS]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64*> [[BASEPTRS:%.*]], i64 0 +; CHECK-NEXT: [[PTR_0:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i64 [[INDICE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64*> [[BASEPTRS]], i64 1 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i64 [[INDICE]] ; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <2 x i64*> [[PTRS]], i64 1 ; CHECK-NEXT: call void @use(i64* [[PTR_1]]) ; CHECK-NEXT: ret void ; @@ -184,12 +191,14 @@ define void @baseptrs_need_extraction.3elts(<3 x i64*> %baseptrs, i64 %indice) { ; CHECK-LABEL: @baseptrs_need_extraction.3elts( -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <3 x i64*> [[BASEPTRS:%.*]], i64 [[INDICE:%.*]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64*> [[BASEPTRS:%.*]], i64 0 +; CHECK-NEXT: [[PTR_0:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i64 [[INDICE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64*> [[BASEPTRS]], i64 1 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i64 [[INDICE]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64*> [[BASEPTRS]], i64 2 +; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds i64, i64* [[TMP3]], i64 [[INDICE]] ; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 1 ; CHECK-NEXT: call void @use(i64* [[PTR_1]]) -; CHECK-NEXT: [[PTR_2:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 2 ; CHECK-NEXT: call void @use(i64* [[PTR_2]]) ; 
CHECK-NEXT: ret void ; @@ -209,14 +218,17 @@ define void @baseptrs_need_extraction.4elts(<4 x i64*> %baseptrs, i64 %indice) { ; CHECK-LABEL: @baseptrs_need_extraction.4elts( -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <4 x i64*> [[BASEPTRS:%.*]], i64 [[INDICE:%.*]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <4 x i64*> [[PTRS]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64*> [[BASEPTRS:%.*]], i64 0 +; CHECK-NEXT: [[PTR_0:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i64 [[INDICE:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64*> [[BASEPTRS]], i64 1 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i64 [[INDICE]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64*> [[BASEPTRS]], i64 2 +; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds i64, i64* [[TMP3]], i64 [[INDICE]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64*> [[BASEPTRS]], i64 3 +; CHECK-NEXT: [[PTR_3:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i64 [[INDICE]] ; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <4 x i64*> [[PTRS]], i64 1 ; CHECK-NEXT: call void @use(i64* [[PTR_1]]) -; CHECK-NEXT: [[PTR_2:%.*]] = extractelement <4 x i64*> [[PTRS]], i64 2 ; CHECK-NEXT: call void @use(i64* [[PTR_2]]) -; CHECK-NEXT: [[PTR_3:%.*]] = extractelement <4 x i64*> [[PTRS]], i64 3 ; CHECK-NEXT: call void @use(i64* [[PTR_3]]) ; CHECK-NEXT: ret void ; @@ -287,10 +299,11 @@ ; CHECK-LABEL: @first_indice_and_baseptr_are_known.2elts( ; CHECK-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <2 x i64*> [[BASEPTRS:%.*]], i64* [[SECOND_BASEPTR:%.*]], i64 0 ; CHECK-NEXT: [[INDICES_NEW:%.*]] = insertelement <2 x i64> [[INDICES:%.*]], i64 [[SECOND_INDICE:%.*]], i64 0 -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <2 x i64*> [[BASEPTRS_NEW]], <2 x i64> [[INDICES_NEW]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <2 x i64*> [[PTRS]], i64 0 +; CHECK-NEXT: [[PTR_0:%.*]] = getelementptr inbounds i64, i64* 
[[SECOND_BASEPTR]], i64 [[SECOND_INDICE]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64*> [[BASEPTRS_NEW]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[INDICES_NEW]], i64 1 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i64 [[TMP2]] ; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <2 x i64*> [[PTRS]], i64 1 ; CHECK-NEXT: call void @use(i64* [[PTR_1]]) ; CHECK-NEXT: ret void ; @@ -395,17 +408,32 @@ ;------------------------------------------------------------------------------- define void @first_two_baseptrs_is_known.3elts(<3 x i64*> %baseptrs, i64* %second_baseptr, i64* %third_baseptr, <3 x i64> %indices) { -; CHECK-LABEL: @first_two_baseptrs_is_known.3elts( -; CHECK-NEXT: [[BASEPTRS_NEW_TMP:%.*]] = insertelement <3 x i64*> [[BASEPTRS:%.*]], i64* [[SECOND_BASEPTR:%.*]], i64 0 -; CHECK-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <3 x i64*> [[BASEPTRS_NEW_TMP]], i64* [[THIRD_BASEPTR:%.*]], i64 1 -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <3 x i64*> [[BASEPTRS_NEW]], <3 x i64> [[INDICES:%.*]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 0 -; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 1 -; CHECK-NEXT: call void @use(i64* [[PTR_1]]) -; CHECK-NEXT: [[PTR_2:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 2 -; CHECK-NEXT: call void @use(i64* [[PTR_2]]) -; CHECK-NEXT: ret void +; SSE-LABEL: @first_two_baseptrs_is_known.3elts( +; SSE-NEXT: [[BASEPTRS_NEW_TMP:%.*]] = insertelement <3 x i64*> [[BASEPTRS:%.*]], i64* [[SECOND_BASEPTR:%.*]], i64 0 +; SSE-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <3 x i64*> [[BASEPTRS_NEW_TMP]], i64* [[THIRD_BASEPTR:%.*]], i64 1 +; SSE-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[INDICES:%.*]], i64 0 +; SSE-NEXT: [[PTR_0:%.*]] = getelementptr inbounds i64, i64* [[SECOND_BASEPTR]], i64 [[TMP1]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <3 
x i64> [[INDICES]], i64 1 +; SSE-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[THIRD_BASEPTR]], i64 [[TMP2]] +; SSE-NEXT: [[TMP3:%.*]] = extractelement <3 x i64*> [[BASEPTRS_NEW]], i64 2 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <3 x i64> [[INDICES]], i64 2 +; SSE-NEXT: [[PTR_2:%.*]] = getelementptr inbounds i64, i64* [[TMP3]], i64 [[TMP4]] +; SSE-NEXT: call void @use(i64* [[PTR_0]]) +; SSE-NEXT: call void @use(i64* [[PTR_1]]) +; SSE-NEXT: call void @use(i64* [[PTR_2]]) +; SSE-NEXT: ret void +; +; AVX-LABEL: @first_two_baseptrs_is_known.3elts( +; AVX-NEXT: [[BASEPTRS_NEW_TMP:%.*]] = insertelement <3 x i64*> [[BASEPTRS:%.*]], i64* [[SECOND_BASEPTR:%.*]], i64 0 +; AVX-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <3 x i64*> [[BASEPTRS_NEW_TMP]], i64* [[THIRD_BASEPTR:%.*]], i64 1 +; AVX-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <3 x i64*> [[BASEPTRS_NEW]], <3 x i64> [[INDICES:%.*]] +; AVX-NEXT: [[PTR_0:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 0 +; AVX-NEXT: call void @use(i64* [[PTR_0]]) +; AVX-NEXT: [[PTR_1:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 1 +; AVX-NEXT: call void @use(i64* [[PTR_1]]) +; AVX-NEXT: [[PTR_2:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 2 +; AVX-NEXT: call void @use(i64* [[PTR_2]]) +; AVX-NEXT: ret void ; %baseptrs.new.tmp = insertelement <3 x i64*> %baseptrs, i64* %second_baseptr, i64 0 %baseptrs.new = insertelement <3 x i64*> %baseptrs.new.tmp, i64* %third_baseptr, i64 1 @@ -458,12 +486,13 @@ ; CHECK-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <3 x i64*> [[BASEPTRS_NEW_TMP]], i64* [[THIRD_BASEPTR:%.*]], i64 1 ; CHECK-NEXT: [[INDICES_NEW_TMP:%.*]] = insertelement <3 x i64> [[INDICES:%.*]], i64 [[SECOND_INDICE:%.*]], i64 0 ; CHECK-NEXT: [[INDICES_NEW:%.*]] = insertelement <3 x i64> [[INDICES_NEW_TMP]], i64 [[THIRD_INDICE:%.*]], i64 1 -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <3 x i64*> [[BASEPTRS_NEW]], <3 x i64> [[INDICES_NEW]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 
0 +; CHECK-NEXT: [[PTR_0:%.*]] = getelementptr inbounds i64, i64* [[SECOND_BASEPTR]], i64 [[SECOND_INDICE]] +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[THIRD_BASEPTR]], i64 [[THIRD_INDICE]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64*> [[BASEPTRS_NEW]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[INDICES_NEW]], i64 2 +; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i64 [[TMP2]] ; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 1 ; CHECK-NEXT: call void @use(i64* [[PTR_1]]) -; CHECK-NEXT: [[PTR_2:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 2 ; CHECK-NEXT: call void @use(i64* [[PTR_2]]) ; CHECK-NEXT: ret void ; @@ -488,17 +517,32 @@ ;------------------------------------------------------------------------------- define void @first_two_baseptrs_is_knownequal.3elts(<3 x i64*> %baseptrs, i64* %second_baseptr, <3 x i64> %indices) { -; CHECK-LABEL: @first_two_baseptrs_is_knownequal.3elts( -; CHECK-NEXT: [[BASEPTRS_NEW_TMP:%.*]] = insertelement <3 x i64*> [[BASEPTRS:%.*]], i64* [[SECOND_BASEPTR:%.*]], i64 0 -; CHECK-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <3 x i64*> [[BASEPTRS_NEW_TMP]], i64* [[SECOND_BASEPTR]], i64 1 -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <3 x i64*> [[BASEPTRS_NEW]], <3 x i64> [[INDICES:%.*]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 0 -; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 1 -; CHECK-NEXT: call void @use(i64* [[PTR_1]]) -; CHECK-NEXT: [[PTR_2:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 2 -; CHECK-NEXT: call void @use(i64* [[PTR_2]]) -; CHECK-NEXT: ret void +; SSE-LABEL: @first_two_baseptrs_is_knownequal.3elts( +; SSE-NEXT: [[BASEPTRS_NEW_TMP:%.*]] = insertelement <3 x i64*> [[BASEPTRS:%.*]], i64* [[SECOND_BASEPTR:%.*]], i64 0 +; SSE-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <3 x 
i64*> [[BASEPTRS_NEW_TMP]], i64* [[SECOND_BASEPTR]], i64 1 +; SSE-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[INDICES:%.*]], i64 0 +; SSE-NEXT: [[PTR_0:%.*]] = getelementptr inbounds i64, i64* [[SECOND_BASEPTR]], i64 [[TMP1]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[INDICES]], i64 1 +; SSE-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[SECOND_BASEPTR]], i64 [[TMP2]] +; SSE-NEXT: [[TMP3:%.*]] = extractelement <3 x i64*> [[BASEPTRS_NEW]], i64 2 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <3 x i64> [[INDICES]], i64 2 +; SSE-NEXT: [[PTR_2:%.*]] = getelementptr inbounds i64, i64* [[TMP3]], i64 [[TMP4]] +; SSE-NEXT: call void @use(i64* [[PTR_0]]) +; SSE-NEXT: call void @use(i64* [[PTR_1]]) +; SSE-NEXT: call void @use(i64* [[PTR_2]]) +; SSE-NEXT: ret void +; +; AVX-LABEL: @first_two_baseptrs_is_knownequal.3elts( +; AVX-NEXT: [[BASEPTRS_NEW_TMP:%.*]] = insertelement <3 x i64*> [[BASEPTRS:%.*]], i64* [[SECOND_BASEPTR:%.*]], i64 0 +; AVX-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <3 x i64*> [[BASEPTRS_NEW_TMP]], i64* [[SECOND_BASEPTR]], i64 1 +; AVX-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <3 x i64*> [[BASEPTRS_NEW]], <3 x i64> [[INDICES:%.*]] +; AVX-NEXT: [[PTR_0:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 0 +; AVX-NEXT: call void @use(i64* [[PTR_0]]) +; AVX-NEXT: [[PTR_1:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 1 +; AVX-NEXT: call void @use(i64* [[PTR_1]]) +; AVX-NEXT: [[PTR_2:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 2 +; AVX-NEXT: call void @use(i64* [[PTR_2]]) +; AVX-NEXT: ret void ; %baseptrs.new.tmp = insertelement <3 x i64*> %baseptrs, i64* %second_baseptr, i64 0 %baseptrs.new = insertelement <3 x i64*> %baseptrs.new.tmp, i64* %second_baseptr, i64 1 @@ -551,12 +595,12 @@ ; CHECK-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <3 x i64*> [[BASEPTRS_NEW_TMP]], i64* [[SECOND_BASEPTR]], i64 1 ; CHECK-NEXT: [[INDICES_NEW_TMP:%.*]] = insertelement <3 x i64> [[INDICES:%.*]], i64 [[SECOND_INDICE:%.*]], i64 0 ; CHECK-NEXT: 
[[INDICES_NEW:%.*]] = insertelement <3 x i64> [[INDICES_NEW_TMP]], i64 [[SECOND_INDICE]], i64 1 -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <3 x i64*> [[BASEPTRS_NEW]], <3 x i64> [[INDICES_NEW]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 0 -; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 1 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[SECOND_BASEPTR]], i64 [[SECOND_INDICE]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64*> [[BASEPTRS_NEW]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[INDICES_NEW]], i64 2 +; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i64 [[TMP2]] +; CHECK-NEXT: call void @use(i64* [[PTR_1]]) ; CHECK-NEXT: call void @use(i64* [[PTR_1]]) -; CHECK-NEXT: [[PTR_2:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 2 ; CHECK-NEXT: call void @use(i64* [[PTR_2]]) ; CHECK-NEXT: ret void ; @@ -681,17 +725,32 @@ ;------------------------------------------------------------------------------- define void @first_two_baseptrs_is_knownequal.4elts(<3 x i64*> %baseptrs, i64* %second_baseptr, <3 x i64> %indices) { -; CHECK-LABEL: @first_two_baseptrs_is_knownequal.4elts( -; CHECK-NEXT: [[BASEPTRS_NEW_TMP:%.*]] = insertelement <3 x i64*> [[BASEPTRS:%.*]], i64* [[SECOND_BASEPTR:%.*]], i64 0 -; CHECK-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <3 x i64*> [[BASEPTRS_NEW_TMP]], i64* [[SECOND_BASEPTR]], i64 1 -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <3 x i64*> [[BASEPTRS_NEW]], <3 x i64> [[INDICES:%.*]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 0 -; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 1 -; CHECK-NEXT: call void @use(i64* [[PTR_1]]) -; CHECK-NEXT: [[PTR_2:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 2 -; CHECK-NEXT: call void @use(i64* [[PTR_2]]) -; CHECK-NEXT: ret void +; 
SSE-LABEL: @first_two_baseptrs_is_knownequal.4elts( +; SSE-NEXT: [[BASEPTRS_NEW_TMP:%.*]] = insertelement <3 x i64*> [[BASEPTRS:%.*]], i64* [[SECOND_BASEPTR:%.*]], i64 0 +; SSE-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <3 x i64*> [[BASEPTRS_NEW_TMP]], i64* [[SECOND_BASEPTR]], i64 1 +; SSE-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[INDICES:%.*]], i64 0 +; SSE-NEXT: [[PTR_0:%.*]] = getelementptr inbounds i64, i64* [[SECOND_BASEPTR]], i64 [[TMP1]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[INDICES]], i64 1 +; SSE-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[SECOND_BASEPTR]], i64 [[TMP2]] +; SSE-NEXT: [[TMP3:%.*]] = extractelement <3 x i64*> [[BASEPTRS_NEW]], i64 2 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <3 x i64> [[INDICES]], i64 2 +; SSE-NEXT: [[PTR_2:%.*]] = getelementptr inbounds i64, i64* [[TMP3]], i64 [[TMP4]] +; SSE-NEXT: call void @use(i64* [[PTR_0]]) +; SSE-NEXT: call void @use(i64* [[PTR_1]]) +; SSE-NEXT: call void @use(i64* [[PTR_2]]) +; SSE-NEXT: ret void +; +; AVX-LABEL: @first_two_baseptrs_is_knownequal.4elts( +; AVX-NEXT: [[BASEPTRS_NEW_TMP:%.*]] = insertelement <3 x i64*> [[BASEPTRS:%.*]], i64* [[SECOND_BASEPTR:%.*]], i64 0 +; AVX-NEXT: [[BASEPTRS_NEW:%.*]] = insertelement <3 x i64*> [[BASEPTRS_NEW_TMP]], i64* [[SECOND_BASEPTR]], i64 1 +; AVX-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <3 x i64*> [[BASEPTRS_NEW]], <3 x i64> [[INDICES:%.*]] +; AVX-NEXT: [[PTR_0:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 0 +; AVX-NEXT: call void @use(i64* [[PTR_0]]) +; AVX-NEXT: [[PTR_1:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 1 +; AVX-NEXT: call void @use(i64* [[PTR_1]]) +; AVX-NEXT: [[PTR_2:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 2 +; AVX-NEXT: call void @use(i64* [[PTR_2]]) +; AVX-NEXT: ret void ; %baseptrs.new.tmp = insertelement <3 x i64*> %baseptrs, i64* %second_baseptr, i64 0 %baseptrs.new = insertelement <3 x i64*> %baseptrs.new.tmp, i64* %second_baseptr, i64 1 @@ -744,12 +803,12 @@ ; CHECK-NEXT: 
[[BASEPTRS_NEW:%.*]] = insertelement <3 x i64*> [[BASEPTRS_NEW_TMP]], i64* [[SECOND_BASEPTR]], i64 1 ; CHECK-NEXT: [[INDICES_NEW_TMP:%.*]] = insertelement <3 x i64> [[INDICES:%.*]], i64 [[SECOND_INDICE:%.*]], i64 0 ; CHECK-NEXT: [[INDICES_NEW:%.*]] = insertelement <3 x i64> [[INDICES_NEW_TMP]], i64 [[SECOND_INDICE]], i64 1 -; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, <3 x i64*> [[BASEPTRS_NEW]], <3 x i64> [[INDICES_NEW]] -; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 0 -; CHECK-NEXT: call void @use(i64* [[PTR_0]]) -; CHECK-NEXT: [[PTR_1:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 1 +; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr inbounds i64, i64* [[SECOND_BASEPTR]], i64 [[SECOND_INDICE]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64*> [[BASEPTRS_NEW]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[INDICES_NEW]], i64 2 +; CHECK-NEXT: [[PTR_2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i64 [[TMP2]] +; CHECK-NEXT: call void @use(i64* [[PTR_1]]) ; CHECK-NEXT: call void @use(i64* [[PTR_1]]) -; CHECK-NEXT: [[PTR_2:%.*]] = extractelement <3 x i64*> [[PTRS]], i64 2 ; CHECK-NEXT: call void @use(i64* [[PTR_2]]) ; CHECK-NEXT: ret void ;