Index: llvm/trunk/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/trunk/lib/Target/ARM/ARMInstrMVE.td +++ llvm/trunk/lib/Target/ARM/ARMInstrMVE.td @@ -3986,6 +3986,7 @@ def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>; def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; +let hasSideEffects = 1 in class MVE_VCTP size, list pattern=[]> : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix, "$Rn", vpred_n, "", pattern> { Index: llvm/trunk/lib/Target/ARM/MVETailPredication.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/MVETailPredication.cpp +++ llvm/trunk/lib/Target/ARM/MVETailPredication.cpp @@ -84,7 +84,7 @@ /// Is the icmp that generates an i1 vector, based upon a loop counter /// and a limit that is defined outside the loop. - bool isTailPredicate(Value *Predicate, Value *NumElements); + bool isTailPredicate(Instruction *Predicate, Value *NumElements); }; } // end namespace @@ -178,7 +178,7 @@ return Changed; } -bool MVETailPredication::isTailPredicate(Value *V, Value *NumElements) { +bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) { // Look for the following: // %trip.count.minus.1 = add i32 %N, -1 @@ -206,7 +206,7 @@ Instruction *Induction = nullptr; // The vector icmp - if (!match(V, m_ICmp(Pred, m_Instruction(Induction), + if (!match(I, m_ICmp(Pred, m_Instruction(Induction), m_Instruction(Shuffle))) || Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle)) return false; @@ -390,6 +390,55 @@ return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt); } +// Look through the exit block to see whether there's a duplicate predicate +// instruction. This can happen when we need to perform a select on values +// from the last and previous iteration. Instead of doing a straight +// replacement of that predicate with the vctp, clone the vctp and place it +// in the block. 
This means that the VPR doesn't have to be live into the +// exit block which should make it easier to convert this loop into a proper +// tail predicated loop. +static void Cleanup(DenseMap &NewPredicates, + SetVector &MaybeDead, Loop *L) { + if (BasicBlock *Exit = L->getUniqueExitBlock()) { + for (auto &Pair : NewPredicates) { + Instruction *OldPred = Pair.first; + Instruction *NewPred = Pair.second; + + for (auto &I : *Exit) { + if (I.isSameOperationAs(OldPred)) { + Instruction *PredClone = NewPred->clone(); + PredClone->insertBefore(&I); + I.replaceAllUsesWith(PredClone); + MaybeDead.insert(&I); + break; + } + } + } + } + + // Drop references and add operands to check for dead. + SmallPtrSet Dead; + while (!MaybeDead.empty()) { + auto *I = MaybeDead.front(); + MaybeDead.remove(I); + if (I->hasNUsesOrMore(1)) + continue; + + for (auto &U : I->operands()) { + if (auto *OpI = dyn_cast(U)) + MaybeDead.insert(OpI); + } + I->dropAllReferences(); + Dead.insert(I); + } + + for (auto *I : Dead) + I->eraseFromParent(); + + for (auto I : L->blocks()) + DeleteDeadPHIs(I); +} + bool MVETailPredication::TryConvert(Value *TripCount) { if (!IsPredicatedVectorLoop()) return false; @@ -400,13 +449,14 @@ // operand is generated from an induction variable. Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); - SmallPtrSet Predicates; + SetVector Predicates; + DenseMap NewPredicates; for (auto *I : MaskedInsts) { Intrinsic::ID ID = I->getIntrinsicID(); unsigned PredOp = ID == Intrinsic::masked_load ? 
2 : 3; - Value *Predicate = I->getArgOperand(PredOp); - if (Predicates.count(Predicate)) + auto *Predicate = dyn_cast(I->getArgOperand(PredOp)); + if (!Predicate || Predicates.count(Predicate)) continue; VectorType *VecTy = getVectorType(I); @@ -445,6 +495,7 @@ Value *Remaining = Builder.CreateSub(Processed, Factor); Value *TailPredicate = Builder.CreateCall(VCTP, Remaining); Predicate->replaceAllUsesWith(TailPredicate); + NewPredicates[Predicate] = cast(TailPredicate); // Add the incoming value to the new phi. Processed->addIncoming(Remaining, L->getLoopLatch()); @@ -453,9 +504,8 @@ << "TP: Inserted VCTP: " << *TailPredicate << "\n"); } - for (auto I : L->blocks()) - DeleteDeadPHIs(I); - + // Now clean up. + Cleanup(NewPredicates, Predicates, L); return true; } Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -0,0 +1,292 @@ +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s + +; CHECK-LABEL: vpsel_mul_reduce_add +; CHECK: dls lr, lr +; CHECK: [[LOOP:.LBB[0-9_]+]]: +; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4 +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vstr p0, [sp +; CHECK: vpstt +; CHECK-NEXT: vldrwt.u32 +; CHECK-NEXT: vldrwt.u32 +; CHECK: vcmp.i32 +; CHECK: vpsel +; CHECK: vldr p0, [sp +; CHECK: vpst +; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0] +; CHECK: le lr, [[LOOP]] +; CHECK: vctp.32 [[ELEMS]] +; CHECK-NEXT: vpsel +; CHECK-NEXT: vaddv.u32 +define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph 
+ +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + %tmp = getelementptr inbounds i32, i32* %a, i32 %index + %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp2 = bitcast i32* %tmp to <4 x i32>* + %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index + %tmp4 = bitcast i32* %tmp3 to <4 x i32>* + %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index + %tmp6 = bitcast i32* %tmp5 to <4 x i32>* + %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %rem = urem i32 %index, 16 + %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0 + %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %cmp = icmp eq <4 x i32> %rem.broadcast.splat, + %wide.masked.load = select <4 x i1> %cmp, <4 x i32> %wide.masked.load.b, <4 x i32> %wide.masked.load.c + %mul = mul nsw <4 x i32> %wide.masked.load, %wide.masked.load.a + 
%add = add nsw <4 x i32> %mul, %vec.phi + %index.next = add i32 %index, 4 + %tmp7 = icmp eq i32 %index.next, %n.vec + br i1 %tmp7, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi + %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8) + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp9, %middle.block ] + ret i32 %res.0.lcssa +} + +; CHECK-LABEL: vpsel_mul_reduce_add_2 +; CHECK: dls lr, lr +; CHECK: [[LOOP:.LBB[0-9_]+]]: +; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4 +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vstr p0, [sp +; CHECK: vpstt +; CHECK-NEXT: vldrwt.u32 +; CHECK-NEXT: vldrwt.u32 +; CHECK: vsub +; CHECK: vpst +; CHECK-NEXT: vldrwt.u32 +; CHECK: vcmp.i32 +; CHECK: vpsel +; CHECK: vldr p0, [sp +; CHECK: vpst +; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0] +; CHECK: le lr, [[LOOP]] +; CHECK: vctp.32 [[ELEMS]] +; CHECK-NEXT: vpsel +; CHECK-NEXT: vaddv.u32 +define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, + i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + 
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + %tmp = getelementptr inbounds i32, i32* %a, i32 %index + %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp2 = bitcast i32* %tmp to <4 x i32>* + %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index + %tmp4 = bitcast i32* %tmp3 to <4 x i32>* + %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index + %tmp6 = bitcast i32* %tmp5 to <4 x i32>* + %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index + %tmp8 = bitcast i32* %tmp7 to <4 x i32>* + %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %sub = sub <4 x i32> %wide.masked.load.c, %wide.masked.load.d + %rem = urem i32 %index, 16 + %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0 + %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %cmp = icmp eq <4 x i32> %rem.broadcast.splat, + %sel = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %wide.masked.load.b + %mul = mul <4 x i32> %sel, %wide.masked.load.a + %add = add <4 x i32> %mul, %vec.phi + %index.next = add i32 %index, 4 + %cmp.exit = icmp eq i32 %index.next, %n.vec + br i1 %cmp.exit, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi + %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) + br label %for.cond.cleanup + 
+for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ] + ret i32 %res.0.lcssa +} + +; CHECK-LABEL: and_mul_reduce_add +; CHECK: dls lr, lr +; CHECK: [[LOOP:.LBB[0-9_]+]]: +; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4 +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vpstt +; CHECK-NEXT: vldrwt.u32 +; CHECK-NEXT: vldrwt.u32 +; CHECK: vpsttt +; CHECK-NEXT: vcmpt.i32 eq, {{.*}}, zr +; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3] +; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2] +; CHECK: le lr, [[LOOP]] +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vpsel +define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, + i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + %tmp = getelementptr inbounds i32, i32* %a, i32 %index + %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp2 = bitcast i32* %tmp to <4 x i32>* + %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %tmp3 = getelementptr inbounds i32, 
i32* %b, i32 %index + %tmp4 = bitcast i32* %tmp3 to <4 x i32>* + %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b + %cmp = icmp eq <4 x i32> %sub, + %mask = and <4 x i1> %cmp, %tmp1 + %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index + %tmp6 = bitcast i32* %tmp5 to <4 x i32>* + %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef) + %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index + %tmp8 = bitcast i32* %tmp7 to <4 x i32>* + %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef) + %mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d + %add = add <4 x i32> %mul, %vec.phi + %index.next = add i32 %index, 4 + %cmp.exit = icmp eq i32 %index.next, %n.vec + br i1 %cmp.exit, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi + %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ] + ret i32 %res.0.lcssa +} + +; TODO: Why does p0 get reloaded from the stack into p0, just to be vmrs'd? 
+; CHECK-LABEL: or_mul_reduce_add +; CHECK: dls lr, lr +; CHECK: [[LOOP:.LBB[0-9_]+]]: +; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4 +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vstr p0, [sp +; CHECK: vpstt +; CHECK-NEXT: vldrwt.u32 +; CHECK-NEXT: vldrwt.u32 +; CHECK: vcmp.i32 eq, {{.*}}, zr +; CHECK: vmrs [[VCMP:r[0-9]+]], p0 +; CHECK: vldr p0, [sp +; CHECK: vmrs [[VCTP:r[0-9]+]], p0 +; CHECK: orr{{.*}} [[VCMP]], [[VCTP]] +; CHECK-NEXT: vmsr p0 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3] +; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2] +; CHECK: le lr, [[LOOP]] +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vpsel +define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, + i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + %tmp = getelementptr inbounds i32, i32* %a, i32 %index + %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %tmp2 = bitcast i32* %tmp to <4 x i32>* + %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %tmp3 
= getelementptr inbounds i32, i32* %b, i32 %index + %tmp4 = bitcast i32* %tmp3 to <4 x i32>* + %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef) + %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b + %cmp = icmp eq <4 x i32> %sub, + %mask = or <4 x i1> %cmp, %tmp1 + %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index + %tmp6 = bitcast i32* %tmp5 to <4 x i32>* + %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef) + %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index + %tmp8 = bitcast i32* %tmp7 to <4 x i32>* + %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef) + %mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d + %add = add <4 x i32> %mul, %vec.phi + %index.next = add i32 %index, 4 + %cmp.exit = icmp eq i32 %index.next, %n.vec + br i1 %cmp.exit, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi + %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ] + ret i32 %res.0.lcssa +} + +; Function Attrs: argmemonly nounwind readonly willreturn +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + +; Function Attrs: nounwind readnone willreturn +declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -0,0 
+1,242 @@ +; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s + +; CHECK-LABEL: mul_reduce_add +; CHECK: dls lr, +; CHECK: [[LOOP:.LBB[0-9_]+]]: +; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4 +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vpstt +; CHECK-NEXT: vldrwt.u32 +; CHECK-NEXT: vldrwt.u32 +; CHECK: le lr, [[LOOP]] +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vpsel +; CHECK: vaddv.u32 r0 +define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + %0 = getelementptr inbounds i32, i32* %a, i32 %index + %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %2 = bitcast i32* %0 to <4 x i32>* + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) + %3 = getelementptr inbounds i32, i32* %b, i32 %index + %4 = bitcast i32* %3 to <4 x i32>* + %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef) + %5 = mul nsw <4 x 
i32> %wide.masked.load13, %wide.masked.load + %6 = add nsw <4 x i32> %5, %vec.phi + %index.next = add i32 %index, 4 + %7 = icmp eq i32 %index.next, %n.vec + br i1 %7, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi + %9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %8) + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ] + ret i32 %res.0.lcssa +} + +; Function Attrs: norecurse nounwind readonly +define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) { +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + %0 = getelementptr inbounds i32, i32* %a, i32 %index + %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + %2 = bitcast i32* %0 to <4 x i32>* + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) + %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi + %index.next = add i32 %index, 4 + %4 = icmp eq i32 %index.next, %n.vec + br i1 %4, label 
%middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi + %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ] + ret i32 %res.0.lcssa +} + +; CHECK-LABEL: add_reduce_add_const +; CHECK: dls lr, lr +; CHECK: [[LOOP:.LBB[0-9_]+]]: +; CHECK: subs [[ELEMS:r[0-9]+]], #4 +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vpst +; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0] +; CHECK: vadd.i32 +; CHECK: le lr, [[LOOP]] +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vpsel +define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) { +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + %0 = getelementptr inbounds i32, i32* %a, i32 %index + %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 + %2 = bitcast i32* %0 to <4 x i32>* + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) + %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi + 
%index.next = add i32 %index, 4 + %4 = icmp eq i32 %index.next, %n.vec + br i1 %4, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi + %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ] + ret i32 %res.0.lcssa +} + +; CHECK-LABEL: vector_mul_const +; CHECK: dls lr, lr +; CHECK: [[LOOP:.LBB[0-9_]+]]: +; CHECK: subs [[ELEMS:r[0-9]+]], #4 +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vpst +; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1] +; CHECK: vmul.i32 +; CHECK: vpst +; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0] +; CHECK: le lr, [[LOOP]] +define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) { +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer + %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0 + %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + %0 = getelementptr inbounds i32, i32* %b, i32 %index + %1 = icmp ule <4 x i32> %induction, 
%broadcast.splat9 + %2 = bitcast i32* %0 to <4 x i32>* + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) + %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11 + %4 = getelementptr inbounds i32, i32* %a, i32 %index + %5 = bitcast i32* %4 to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n.vec + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +; CHECK-LABEL: vector_add_const +; CHECK: dls lr, lr +; CHECK: [[LOOP:.LBB[0-9_]+]]: +; CHECK: subs [[ELEMS:r[0-9]+]], #4 +; CHECK: vctp.32 [[ELEMS]] +; CHECK: vpst +; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1] +; CHECK: vadd.i32 +; CHECK: vpst +; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0] +; CHECK: le lr, [[LOOP]] +define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) { +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 3 + %n.vec = and i32 %n.rnd.up, -4 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer + %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0 + %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + 
%induction = add <4 x i32> %broadcast.splat, + %0 = getelementptr inbounds i32, i32* %b, i32 %index + %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 + %2 = bitcast i32* %0 to <4 x i32>* + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef) + %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11 + %4 = getelementptr inbounds i32, i32* %a, i32 %index + %5 = bitcast i32* %4 to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1) + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n.vec + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4 +declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -0,0 +1,75 @@ + +; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s + +; CHECK-LABEL: vec_mul_reduce_add + +; CHECK: vector.body: +; CHECK-NOT: phi i32 [ 0, %vector.ph ] +; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ] +; CHECK: [[SUB]] = sub i32 [[ELTS]], 4 +; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]]) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]] +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], + +; CHECK: 
middle.block: +; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]]) +; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]], +; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) + +define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %b, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %10, %vector.body ] + %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>* + %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, + %7 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef) + %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef) + %8 = mul nsw <4 x 
i32> %wide.masked.load13, %wide.masked.load + %9 = add nsw <4 x i32> %8, %vec.phi + %index.next = add i32 %index, 4 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4 + %10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) + %11 = icmp ne i32 %10, 0 + br i1 %11, label %vector.body, label %middle.block + +middle.block: ; preds = %vector.body + %12 = icmp ule <4 x i32> %induction, %broadcast.splat12 + %13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi + %14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13) + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ] + ret i32 %res.0.lcssa +} + +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) +declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) +declare void @llvm.set.loop.iterations.i32(i32) +declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + Index: llvm/trunk/test/CodeGen/Thumb2/mve-vctp.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/mve-vctp.ll +++ llvm/trunk/test/CodeGen/Thumb2/mve-vctp.ll @@ -4,8 +4,8 @@ define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) { ; CHECK-LABEL: vctp8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vctp.8 r0 +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r2] @@ -20,8 +20,8 @@ define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) { ; CHECK-LABEL: vctp16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vctp.16 r0 +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r2] @@ -36,8 +36,8 @@ define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) { ; CHECK-LABEL: vctp32: ; CHECK: 
@ %bb.0: -; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r2]