diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9343,6 +9343,7 @@ VPlanTransforms::sinkScalarOperands(*Plan); VPlanTransforms::mergeReplicateRegions(*Plan); + VPlanTransforms::optimizeRecipes(*Plan); std::string PlanName; raw_string_ostream RSO(PlanName); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -32,6 +32,9 @@ static bool sinkScalarOperands(VPlan &Plan); static bool mergeReplicateRegions(VPlan &Plan); + + /// Apply local optimizations to the recipes in \p Plan. + static void optimizeRecipes(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -282,3 +282,39 @@ delete ToDelete; return Changed; } + +void VPlanTransforms::optimizeRecipes(VPlan &Plan) { + auto Iter = depth_first( + VPBlockRecursiveTraversalWrapper(Plan.getEntry())); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { + for (auto &Recipe : make_early_inc_range(*VPBB)) { + // Try to widen replicate recipes, if the recipe has users that use + // the vector value. 
+ if (auto *RepR = dyn_cast(&Recipe)) { + if (RepR->isPredicated() || RepR->isUniform()) + continue; + + if (!isa(RepR->getUnderlyingValue())) + continue; + + auto IsUsedAsVector = [RepR](VPUser *U) { + auto *UI = dyn_cast(U); + if (!UI) + return false; + if (auto *Mem = dyn_cast(UI)) + return Mem->getNumOperands() > 1 && RepR == Mem->getOperand(1); + return isa(UI); + }; + if (!any_of(RepR->users(), IsUsedAsVector)) + continue; + + auto *W = new VPWidenGEPRecipe( + cast(RepR->getUnderlyingValue()), + RepR->operands()); + W->insertBefore(RepR); + RepR->replaceAllUsesWith(W); + RepR->eraseFromParent(); + } + } + } +} diff --git a/llvm/test/Transforms/LoopVectorize/gep-used-as-scalar-and-vector.ll b/llvm/test/Transforms/LoopVectorize/gep-used-as-scalar-and-vector.ll --- a/llvm/test/Transforms/LoopVectorize/gep-used-as-scalar-and-vector.ll +++ b/llvm/test/Transforms/LoopVectorize/gep-used-as-scalar-and-vector.ll @@ -17,23 +17,20 @@ ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* %src, i64 [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX1]], 1 ; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, i8* %src, i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP6]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP7]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i8*> poison, i8* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i8*> [[TMP6]], i8* [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8** [[TMP8]] to <2 x i8*>* -; CHECK-NEXT: store <2 x i8*> [[TMP7]], <2 x i8*>* [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[NEXT_GEP6]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <2 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP11]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i8> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP13:%.*]] = 
bitcast i8* [[TMP10]] to <2 x i8>* -; CHECK-NEXT: store <2 x i8> [[TMP12]], <2 x i8>* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP6]], <2 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8** [[TMP5]] to <2 x i8*>* +; CHECK-NEXT: store <2 x i8*> [[TMP4]], <2 x i8*>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[NEXT_GEP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP8]], align 1 +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>* +; CHECK-NEXT: store <2 x i8> [[TMP9]], <2 x i8>* [[TMP10]], align 1 ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], -; CHECK-NEXT: br i1 [[TMP14]], label %middle.block, label %vector.body -; +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT2]], +; CHECK-NEXT: br i1 [[TMP11]], label %middle.block, label %vector.body + entry: br label %loop.body