Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -555,7 +555,7 @@ /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - int getTreeCost(); + int getTreeCost(bool GatherLoads = false); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -2321,7 +2321,7 @@ return Cost; } -int BoUpSLP::getTreeCost() { +int BoUpSLP::getTreeCost(bool GatherLoads) { int Cost = 0; DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); @@ -2348,6 +2348,12 @@ if (EphValues.count(EU.User)) continue; + // Users of the roots have been vectorized already, the new extractelement + // instructions are canceled out by already added insertelement + // instructions. + if (GatherLoads) + continue; + // If we plan to rewrite the tree in a smaller type, we will need to sign // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. 
@@ -3315,7 +3321,7 @@ } bool vectorizeAccessChain(ArrayRef<Value *> Chain, BoUpSLP &R, - unsigned VecRegSize) { + unsigned VecRegSize, bool GatherOpt) { assert(!Chain.empty() && (isa<LoadInst>(Chain[0]) || isa<StoreInst>(Chain[0])) && "Chain has to be non-empty and contain load or store instructions"); @@ -3354,7 +3360,7 @@ R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + int Cost = R.getTreeCost(GatherOpt); DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { @@ -3387,7 +3393,8 @@ static bool vectorizeAccesses(ArrayRef<Value *> Accesses, BoUpSLP &R, - const DataLayout *DL, ScalarEvolution *SE) { + const DataLayout *DL, ScalarEvolution *SE, + bool GatherOpt) { SetVector<Value *> Heads, Tails; SmallDenseMap<Value *, Value *> ConsecutiveChain; @@ -3445,7 +3452,7 @@ // register size is a power-of-2? for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) { - if (vectorizeAccessChain(Operands, R, Size)) { + if (vectorizeAccessChain(Operands, R, Size, GatherOpt)) { // Mark the vectorized stores so that we don't vectorize them again. VectorizedAccesses.insert(Operands.begin(), Operands.end()); Changed = true; @@ -3462,6 +3469,7 @@ DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. + SmallPtrSet<LoadInst *, 8> GatherLoads; for (Instruction *it : GatherSeq) { InsertElementInst *Insert = dyn_cast<InsertElementInst>(it); @@ -3469,6 +3477,11 @@ if (!Insert) continue; + // Collect scalar loads, which can potentially be combined to vector + // loads. + if (LoadInst *LI = dyn_cast<LoadInst>(Insert->getOperand(1))) + GatherLoads.insert(LI); + // Check if this block is inside a loop. Loop *L = LI->getLoopFor(Insert->getParent()); if (!L) @@ -3493,6 +3506,11 @@ Insert->moveBefore(PreHeader->getTerminator()); } + if (!GatherLoads.empty()) { + SmallVector<Value *, 8> LoadV(GatherLoads.begin(), GatherLoads.end()); + vectorizeAccesses(LoadV, *this, DL, SE, true); + } + // Make a list of all reachable blocks in our CSE queue. 
SmallVector CSEWorkList; CSEWorkList.reserve(CSEBlocks.size()); @@ -4389,6 +4407,7 @@ } if (Changed) { + R.deleteTree(); R.optimizeGatherSequence(); DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); DEBUG(verifyFunction(F)); @@ -5882,7 +5901,7 @@ for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { unsigned Len = std::min(CE - CI, 16); Changed |= vectorizeAccesses(makeArrayRef(&it->second[CI], Len), R, DL, - SE); + SE, false); } } return Changed; Index: test/Transforms/SLPVectorizer/AArch64/merge-gather-loads.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/AArch64/merge-gather-loads.ll @@ -0,0 +1,67 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -instcombine -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 | FileCheck %s +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: @test1 +; CHECK: [[BC:[a-z0-9]+]] = bitcast i32* %b to <2 x i32>* +; CHECK: [[MERGED_LOAD:[a-z0-9]+]] = load <2 x i32>, <2 x i32>* %[[BC]], align 4 +; CHECK: = shufflevector <2 x i32> %[[MERGED_LOAD]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK: = shufflevector <2 x i32> %[[MERGED_LOAD]], <2 x i32> undef, <2 x i32> + +define void @test1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c) { +entry: + %scalar.1 = load i32, i32* %b, align 4 + %v1.1 = load i32, i32* %c, align 4 + %add = mul nsw i32 %v1.1, %scalar.1 + store i32 %add, i32* %a, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %c, i64 1 + %v1.2 = load i32, i32* %arrayidx3, align 4 + %add5 = mul nsw i32 %v1.2, %scalar.1 + %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 1 + store i32 %add5, i32* %arrayidx7, align 4 + + %si.2 = getelementptr inbounds i32, i32* %b, i64 1 + %scalar.2 = load i32, i32* %si.2, align 4 + + %arrayidx8 = getelementptr inbounds i32, i32* %c, i64 2 + %v1.3 = load i32, i32* %arrayidx8, align 
4 + %add10 = mul nsw i32 %v1.3, %scalar.2 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 2 + store i32 %add10, i32* %arrayidx12, align 4 + %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 3 + %v1.4 = load i32, i32* %arrayidx13, align 4 + %add15 = mul nsw i32 %v1.4, %scalar.2 + %arrayidx17 = getelementptr inbounds i32, i32* %a, i64 3 + store i32 %add15, i32* %arrayidx17, align 4 + + ret void +} + +define void @test_add(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c) { +entry: + %scalar.1 = load i32, i32* %b, align 4 + %v1.1 = load i32, i32* %c, align 4 + %add = add nsw i32 %v1.1, %scalar.1 + store i32 %add, i32* %a, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %c, i64 1 + %v1.2 = load i32, i32* %arrayidx3, align 4 + %add5 = add nsw i32 %v1.2, %scalar.1 + %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 1 + store i32 %add5, i32* %arrayidx7, align 4 + + %si.2 = getelementptr inbounds i32, i32* %b, i64 1 + %scalar.2 = load i32, i32* %si.2, align 4 + + %arrayidx8 = getelementptr inbounds i32, i32* %c, i64 2 + %v1.3 = load i32, i32* %arrayidx8, align 4 + %add10 = add nsw i32 %v1.3, %scalar.2 + %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 2 + store i32 %add10, i32* %arrayidx12, align 4 + %arrayidx13 = getelementptr inbounds i32, i32* %c, i64 3 + %v1.4 = load i32, i32* %arrayidx13, align 4 + %add15 = add nsw i32 %v1.4, %scalar.2 + %arrayidx17 = getelementptr inbounds i32, i32* %a, i64 3 + store i32 %add15, i32* %arrayidx17, align 4 + + ret void +} Index: test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -11,20 +11,21 @@ define i32 @fn1() { ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 
0, i32 0), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 2), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 8, i32 3 -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 3 +; CHECK-NEXT: 
[[TMP9:%.*]] = icmp sgt <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP2]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 8, i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP12]], <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 ; CHECK-NEXT: ret i32 0 ; entry: Index: test/Transforms/SLPVectorizer/X86/operandorder.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/operandorder.ll +++ test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -25,8 +25,9 @@ } ; CHECK-LABEL: shuffle_preserve_broadcast -; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK: bitcast double* %from to <2 x double>* +; CHECK: %[[MERGED_LOAD:[a-z0-9]+]] = load <2 x double>, <2 x double>* %0, align 4 +; CHECK: shufflevector <2 x double> %[[MERGED_LOAD]], <2 x double> undef, <2 x i32> zeroinitializer define void @shuffle_preserve_broadcast(double * noalias %from, double * noalias %to, double %v1, double %v2) { @@ -50,8 +51,11 @@ } ; CHECK-LABEL: shuffle_preserve_broadcast2 -; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK: bitcast double* %from to <2 x double>* +; CHECK: %[[MERGED_LOAD:[a-z0-9]+]] = load <2 x double>, <2 x double>* %0, align 4 +; CHECK: %[[P_INSERT:[a-z0-9]+]] = insertelement <2 x double> undef, double %p, i32 0 +; CHECK: = shufflevector <2 x double> %[[P_INSERT]], <2 x double> %[[MERGED_LOAD]], <2 x i32> +; CHECK: = shufflevector <2 x double> 
%[[MERGED_LOAD]], <2 x double> undef, <2 x i32> zeroinitializer define void @shuffle_preserve_broadcast2(double * noalias %from, double * noalias %to, double %v1, double %v2) { @@ -75,8 +79,8 @@ } ; CHECK-LABEL: shuffle_preserve_broadcast3 -; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK: %[[MERGED_LOAD:[a-z0-9]+]] = load <2 x double>, <2 x double>* %0, align 4 +; CHECK: = shufflevector <2 x double> %[[MERGED_LOAD]], <2 x double> undef, <2 x i32> zeroinitializer define void @shuffle_preserve_broadcast3(double * noalias %from, double * noalias %to, double %v1, double %v2) { @@ -101,8 +105,8 @@ ; CHECK-LABEL: shuffle_preserve_broadcast4 -; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK: %[[MERGED_LOAD:[a-z0-9]+]] = load <2 x double>, <2 x double>* %0, align 4 +; CHECK: = shufflevector <2 x double> %[[MERGED_LOAD]], <2 x double> undef, <2 x i32> zeroinitializer define void @shuffle_preserve_broadcast4(double * noalias %from, double * noalias %to, double %v1, double %v2) { @@ -126,8 +130,8 @@ } ; CHECK-LABEL: shuffle_preserve_broadcast5 -; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x double> undef, <2 x i32> zeroinitializer +; CHECK: %[[MERGED_LOAD:[a-z0-9]+]] = load <2 x double>, <2 x double>* %0, align 4 +; CHECK: = shufflevector <2 x double> %[[MERGED_LOAD]], <2 x double> undef, <2 x i32> zeroinitializer define void @shuffle_preserve_broadcast5(double * noalias %from, double * noalias %to, double %v1, double %v2) { @@ -152,8 +156,8 @@ ; CHECK-LABEL: shuffle_preserve_broadcast6 -; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1 -; CHECK: = shufflevector <2 x double> %[[BCAST]], <2 x 
double> undef, <2 x i32> zeroinitializer +; CHECK: %[[MERGED_LOAD:[a-z0-9]+]] = load <2 x double>, <2 x double>* %0, align 4 +; CHECK: = shufflevector <2 x double> %[[MERGED_LOAD]], <2 x double> undef, <2 x i32> zeroinitializer define void @shuffle_preserve_broadcast6(double * noalias %from, double * noalias %to, double %v1, double %v2) {