Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -497,6 +497,14 @@
                             const VPIteration &Instance, bool IfPredicateInstr,
                             VPTransformState &State);
 
+  /// Same as above, except that the lane comes from a runtime value, and the
+  /// cloned instruction is returned instead of being directly stored into
+  /// the transform state.
+  Instruction *
+  scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
+                       unsigned Part, Value *Lane, bool IfPredicateInstr,
+                       VPTransformState &State);
+
   /// Construct the vector value of a scalarized value \p V one lane at a time.
   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                  VPTransformState &State);
@@ -2779,6 +2787,70 @@
 
     PredicatedInstructions.push_back(Cloned);
 }
+
+Instruction *
+InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+                                          VPReplicateRecipe *RepRecipe,
+                                          unsigned Part, Value *Lane,
+                                          bool IfPredicateInstr,
+                                          VPTransformState &State) {
+  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+
+  // llvm.experimental.noalias.scope.decl intrinsics can just be dropped.
+  // TODO: add special first lane handling
+  if (isa<NoAliasScopeDeclInst>(Instr))
+    return nullptr;
+
+  // Does this instruction return a value?
+  bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+  Instruction *Cloned = Instr->clone();
+  if (!IsVoidRetTy)
+    Cloned->setName(Instr->getName() + ".cloned");
+
+  // If the scalarized instruction contributes to the address computation of a
+  // widen masked load/store which was in a basic block that needed predication
+  // and is not predicated after vectorization, we can't propagate
+  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
+  // instruction could feed a poison value to the base address of the widen
+  // load/store.
+  if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
+    Cloned->dropPoisonGeneratingFlags();
+
+  if (Instr->getDebugLoc())
+    State.setDebugLocFromInst(Instr);
+
+  // Replace the operands of the cloned instructions with their scalar
+  // equivalents in the new loop.
+  for (auto &I : enumerate(RepRecipe->operands())) {
+    VPValue *Operand = I.value();
+    if (VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand))
+      if (OperandR->isUniform()) {
+        VPIteration First = {Part, VPLane::getFirstLane()};
+        Cloned->setOperand(I.index(), State.get(Operand, First));
+        continue;
+      }
+    auto *VecPart = State.get(Operand, Part);
+    auto *Extract = Builder.CreateExtractElement(VecPart, Lane);
+    Cloned->setOperand(I.index(), Extract);
+  }
+  State.addNewMetadata(Cloned, Instr);
+
+  // Place the cloned scalar in the new loop.
+  State.Builder.Insert(Cloned);
+
+  // If we just cloned a new assumption, add it to the assumption cache.
+  if (auto *II = dyn_cast<AssumeInst>(Cloned))
+    AC->registerAssumption(II);
+
+  // End if-block.
+  if (IfPredicateInstr)
+    PredicatedInstructions.push_back(Cloned);
+
+  return Cloned;
+}
+
+
 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
   if (TripCount)
     return TripCount;
@@ -6791,6 +6863,13 @@
     // Scalarization of fixed length vectors "just works".
     return true;
 
+  // If not predicated, we can now scalarize generically with a loop
+  // if needed. The remainder of the code below is about checking
+  // for cases we can scalarize with predication without hitting
+  // the generic replicate path which isn't yet implemented.
+  if (!foldTailByMasking())
+    return true;
+
   // For scalable vectors, a uniform memop load is always
   // uniform-by-parts and we know how to scalarize that.
   if (isa<LoadInst>(I))
@@ -9597,6 +9676,67 @@
     return;
   }
 
+  if (State.VF.isScalable()) {
+    // For scalable vectors, we can scalarize by using an inner loop to
+    // execute the statically unknown number of iterations required.
+    // TODO: This strategy could be used for long fixed length vectors if
+    // profitable.
+    // TODO: Instead of one sub-loop per part, we could use one loop
+    // processing lanes of each unrolled copy at once.
+    auto *Instr = getUnderlyingInstr();
+
+    auto &Builder = State.Builder;
+    Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
+
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+
+      auto *InsertPt = &*Builder.GetInsertPoint();
+      BasicBlock *PreheaderBB = InsertPt->getParent();
+      BasicBlock *HeaderBB = SplitBlock(InsertPt->getParent(), InsertPt);
+      BasicBlock *ExitBB = SplitBlock(InsertPt->getParent(), InsertPt);
+
+      HeaderBB->getTerminator()->eraseFromParent();
+      Builder.SetInsertPoint(HeaderBB);
+      auto *IV = Builder.CreatePHI(RunTimeVF->getType(), 2);
+      IV->addIncoming(ConstantInt::get(RunTimeVF->getType(), 0), PreheaderBB);
+      PHINode *ResultIV = nullptr;
+      if (!Instr->getType()->isVoidTy()) {
+        auto *ResultTy = VectorType::get(Instr->getType(), State.VF);
+        ResultIV = Builder.CreatePHI(ResultTy, 2);
+        ResultIV->addIncoming(PoisonValue::get(ResultTy), PreheaderBB);
+      }
+
+      Instruction *Cloned =
+          State.ILV->scalarizeInstruction(Instr, this, Part, IV, IsPredicated,
+                                          State);
+
+      if (ResultIV) {
+        auto *Insert = Builder.CreateInsertElement(ResultIV, Cloned, IV);
+        ResultIV->addIncoming(Insert, HeaderBB);
+        State.set(this, Insert, Part);
+      }
+
+      auto *Inc = Builder.CreateAdd(IV, ConstantInt::get(RunTimeVF->getType(), 1));
+      IV->addIncoming(Inc, HeaderBB);
+      auto *Cmp = Builder.CreateICmpNE(IV, RunTimeVF);
+      Builder.CreateCondBr(Cmp, HeaderBB, ExitBB);
+
+      // Update the state so that we can continue at the right point for the next
+      // recipe, and have valid analysis results once done transforming.
+      assert(State.CFG.VPBB2IRBB[getParent()].back() == PreheaderBB);
+      State.CFG.VPBB2IRBB[getParent()].push_back(HeaderBB);
+      State.CFG.VPBB2IRBB[getParent()].push_back(ExitBB);
+      State.CFG.PrevBB = ExitBB;
+
+      assert(State.CurrentVectorLoop);
+      State.CurrentVectorLoop->addBasicBlockToLoop(HeaderBB, *State.LI);
+      State.CurrentVectorLoop->addBasicBlockToLoop(ExitBB, *State.LI);
+
+      Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+    }
+    return;
+  }
+
   // Generate scalar instances for all VF lanes of all UF parts.
   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
   const unsigned EndLane = State.VF.getKnownMinValue();
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -797,6 +797,7 @@
 void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
                                 BasicBlock *LoopLatchBB,
                                 BasicBlock *LoopExitBB) {
+  //LoopHeaderBB->getParent()->dump();
   // The vector body may be more than a single basic-block by this point.
   // Update the dominator tree information inside the vector body by propagating
   // it from header to latch, expecting only triangular control-flow, if any.
@@ -808,8 +809,13 @@
            "Basic block in vector loop has more than 2 successors.");
     PostDomSucc = Succs[0];
     if (Succs.size() == 1) {
-      assert(PostDomSucc->getSinglePredecessor() &&
-             "PostDom successor has more than one predecessor.");
+      DT->addNewBlock(PostDomSucc, BB);
+      continue;
+    }
+    if (Succs[0] == BB || Succs[1] == BB) {
+      // Simple one-block loop, i.e. scalable scalarization.
+      if (Succs[0] == BB)
+        PostDomSucc = Succs[1];
       DT->addNewBlock(PostDomSucc, BB);
       continue;
     }
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -19,18 +19,29 @@
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_SPLIT_SPLIT:%.*]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[SRC:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <vscale x 4 x i16>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP7]], align 2
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16*> [[BROADCAST_SPLAT]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], 4
+; CHECK-NEXT:    br label [[VECTOR_BODY_SPLIT:%.*]]
+; CHECK:       vector.body.split:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i32 [ 0, [[VECTOR_BODY]] ], [ [[TMP13:%.*]], [[VECTOR_BODY_SPLIT]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 4 x i16*> [[BROADCAST_SPLAT]], i32 [[TMP10]]
+; CHECK-NEXT:    store i16 [[TMP11]], i16* [[TMP12]], align 2
+; CHECK-NEXT:    [[TMP13]] = add i32 [[TMP10]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[VECTOR_BODY_SPLIT]], label [[VECTOR_BODY_SPLIT_SPLIT]]
+; CHECK:       vector.body.split.split:
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_INC24:%.*]], label [[SCALAR_PH]]
Index: llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -853,15 +853,59 @@
 define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
 ; SCALABLE-LABEL: @uniform_store_of_loop_varying(
 ; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[B:%.*]], i32 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_SPLIT_SPLIT:%.*]] ]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[INDEX]], i32 0
+; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[TMP3:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP2]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = mul <vscale x 1 x i64> [[TMP3]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP5:%.*]] = add <vscale x 1 x i64> [[DOTSPLAT]], [[TMP4]]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALABLE-NEXT:    br label [[VECTOR_BODY_SPLIT:%.*]]
+; SCALABLE:       vector.body.split:
+; SCALABLE-NEXT:    [[TMP8:%.*]] = phi i32 [ 0, [[VECTOR_BODY]] ], [ [[TMP11:%.*]], [[VECTOR_BODY_SPLIT]] ]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 1 x i64> [[TMP5]], i32 [[TMP8]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractelement <vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 [[TMP8]]
+; SCALABLE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
+; SCALABLE-NEXT:    [[TMP11]] = add i32 [[TMP8]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP8]], [[TMP7]]
+; SCALABLE-NEXT:    br i1 [[TMP12]], label [[VECTOR_BODY_SPLIT]], label [[VECTOR_BODY_SPLIT_SPLIT]]
+; SCALABLE:       vector.body.split.split:
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; SCALABLE-NEXT:    store <vscale x 1 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP14]], align 8
+; SCALABLE-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; SCALABLE-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
 ; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    store i64 [[IV]], ptr [[B:%.*]], align 8
-; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
-; SCALABLE-NEXT:    store i64 [[V:%.*]], ptr [[ARRAYIDX]], align 8
+; SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; SCALABLE-NEXT:    store i64 [[IV]], ptr [[B]], align 8
+; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; SCALABLE:       for.end:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -1159,7 +1203,7 @@
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1173,7 +1217,7 @@
 ; SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; SCALABLE:       for.end:
 ; SCALABLE-NEXT:    ret void
 ;