Index: llvm/trunk/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/trunk/lib/Target/ARM/MVETailPredication.cpp
@@ -491,13 +491,13 @@
   case 16: VCTPID = Intrinsic::arm_vctp8; break;
   }
   Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
-  // TODO: This add likely already exists in the loop.
-  Value *Remaining = Builder.CreateSub(Processed, Factor);
-  Value *TailPredicate = Builder.CreateCall(VCTP, Remaining);
+  Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
   Predicate->replaceAllUsesWith(TailPredicate);
   NewPredicates[Predicate] = cast<Instruction>(TailPredicate);

   // Add the incoming value to the new phi.
+  // TODO: This add likely already exists in the loop.
+  Value *Remaining = Builder.CreateSub(Processed, Factor);
   Processed->addIncoming(Remaining, L->getLoopLatch());
   LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: "
              << *Processed << "\n"
Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -4,8 +4,8 @@
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
-; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
 ; CHECK: tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> {{.*}}, <16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]])
@@ -57,8 +57,8 @@
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> {{.*}}, <8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]])
@@ -109,8 +109,8 @@
 ; CHECK-LABEL: mul_v4i32
 ; CHECK: vector.body:
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
@@ -162,8 +162,8 @@
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 2
-; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]], <2 x i64> undef)
 ; CHECK: tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[LD0]], <2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]])
 define void @copy_v2i64(i64* %a, i64* %b, i32 %N) {
@@ -210,8 +210,8 @@
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
@@ -268,8 +268,8 @@
 ; One of the loads now uses ult predicate.
 ; CHECK-LABEL: mismatch_load_pred
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
@@ -322,8 +322,8 @@
 ; CHECK-LABEL: mismatch_store_pred
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong)
Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -3,8 +3,8 @@
 ; CHECK-LABEL: vpsel_mul_reduce_add
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
 ; CHECK: vstr p0, [sp
 ; CHECK: vpstt
 ; CHECK-NEXT: vldrwt.u32
@@ -14,8 +14,9 @@
 ; CHECK: vldr p0, [sp
 ; CHECK: vpst
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK-NEXT: vpsel
 ; CHECK-NEXT: vaddv.u32
 define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
@@ -71,8 +72,8 @@
 ; CHECK-LABEL: vpsel_mul_reduce_add_2
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
 ; CHECK: vstr p0, [sp
 ; CHECK: vpstt
 ; CHECK-NEXT: vldrwt.u32
@@ -85,8 +86,9 @@
 ; CHECK: vldr p0, [sp
 ; CHECK: vpst
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK-NEXT: vpsel
 ; CHECK-NEXT: vaddv.u32
 define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
@@ -147,17 +149,18 @@
 ; CHECK-LABEL: and_mul_reduce_add
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
 ; CHECK: vpstt
 ; CHECK-NEXT: vldrwt.u32
 ; CHECK-NEXT: vldrwt.u32
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
 ; CHECK: vpsttt
 ; CHECK-NEXT: vcmpt.i32 eq, {{.*}}, zr
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
+; CHECK: sub{{.*}} [[ELEMS]],{{.*}}#4
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK: vpsel
 define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
                                          i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
@@ -215,9 +218,9 @@
 ; CHECK-LABEL: or_mul_reduce_add
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
 ; CHECK: vstr p0, [sp
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
 ; CHECK: vpstt
 ; CHECK-NEXT: vldrwt.u32
 ; CHECK-NEXT: vldrwt.u32
@@ -226,12 +229,13 @@
 ; CHECK: vldr p0, [sp
 ; CHECK: vmrs [[VCTP:r[0-9]+]], p0
 ; CHECK: orr{{.*}} [[VCMP]], [[VCTP]]
+; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], [[ELEMS_OUT]], #4
 ; CHECK-NEXT: vmsr p0
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK: vpsel
 define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
                                         i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -500,16 +500,17 @@
 ; CHECK-NEXT: dls lr, lr
 ; CHECK-NEXT: .LBB4_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vctp.32 r2
+; CHECK-NEXT: mov r3, r2
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vldrwt.u32 q2, [r1]
 ; CHECK-NEXT: adds r1, #16
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vmla.u32 q0, q2, r0
 ; CHECK-NEXT: le lr, .LBB4_1
 ; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vctp.32 r2
+; CHECK-NEXT: vctp.32 r3
 ; CHECK-NEXT: vpsel q0, q0, q1
 ; CHECK-NEXT: vaddv.u32 r0, q0
 ; CHECK-NEXT: pop {r7, pc}
@@ -607,7 +608,6 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vadd.i32 q4, q1, r4
 ; CHECK-NEXT: @ implicit-def: $q5
-; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: vcmp.u32 cs, q0, q4
 ; CHECK-NEXT: @ implicit-def: $q4
 ; CHECK-NEXT: vmrs r6, p0
@@ -681,6 +681,7 @@
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vstrwt.32 q4, [r3]
 ; CHECK-NEXT: adds r3, #16
+; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: le lr, .LBB5_5
 ; CHECK-NEXT: b .LBB5_12
 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new
@@ -903,10 +904,9 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vadd.i32 q2, q1, r4
 ; CHECK-NEXT: @ implicit-def: $q3
-; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: adds r4, #4
 ; CHECK-NEXT: vcmp.u32 cs, q0, q2
 ; CHECK-NEXT: @ implicit-def: $q2
-; CHECK-NEXT: adds r4, #4
 ; CHECK-NEXT: vmrs r6, p0
 ; CHECK-NEXT: and r5, r6, #1
 ; CHECK-NEXT: rsbs r7, r5, #0
@@ -977,6 +977,7 @@
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vstrwt.32 q2, [r3]
 ; CHECK-NEXT: adds r3, #16
+; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: le lr, .LBB6_2
 ; CHECK-NEXT: .LBB6_3: @ %for.cond.cleanup
 ; CHECK-NEXT: add sp, #8
@@ -1084,7 +1085,6 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vadd.i32 q4, q1, r4
 ; CHECK-NEXT: @ implicit-def: $q5
-; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: vcmp.u32 cs, q0, q4
 ; CHECK-NEXT: @ implicit-def: $q4
 ; CHECK-NEXT: vmrs r6, p0
@@ -1158,6 +1158,7 @@
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vstrwt.32 q4, [r3]
 ; CHECK-NEXT: adds r3, #16
+; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: le lr, .LBB7_5
 ; CHECK-NEXT: b .LBB7_12
 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new
@@ -1380,10 +1381,9 @@
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vadd.i32 q2, q1, r4
 ; CHECK-NEXT: @ implicit-def: $q3
-; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: adds r4, #4
 ; CHECK-NEXT: vcmp.u32 cs, q0, q2
 ; CHECK-NEXT: @ implicit-def: $q2
-; CHECK-NEXT: adds r4, #4
 ; CHECK-NEXT: vmrs r6, p0
 ; CHECK-NEXT: and r5, r6, #1
 ; CHECK-NEXT: rsbs r7, r5, #0
@@ -1454,6 +1454,7 @@
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vstrwt.32 q2, [r3]
 ; CHECK-NEXT: adds r3, #16
+; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: le lr, .LBB8_2
 ; CHECK-NEXT: .LBB8_3: @ %for.cond.cleanup
 ; CHECK-NEXT: add sp, #8
@@ -1550,8 +1551,8 @@
 ; CHECK-NEXT: dls lr, lr
 ; CHECK-NEXT: .LBB9_5: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: vctp.32 r12
+; CHECK-NEXT: sub.w r12, r12, #4
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
 ; CHECK-NEXT: vldrwt.u32 q1, [r1]
Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -69,8 +69,8 @@
 ; CHECK: phi
 ; CHECK: phi
 ; CHECK: [[IV:%[^ ]+]] = phi i32 [ %N, %for.cond1.preheader.us ], [ [[REM:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[IV]])
 ; CHECK: [[REM]] = sub i32 [[IV]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REM]])
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
+++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
@@ -50,8 +50,8 @@
 ; CHECK-LABEL: expand_v8i16_v4i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
 ; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS_REM]])
 ; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: %store.pred = icmp ule <4 x i32> %induction.store
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred)
Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
+++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
@@ -5,8 +5,8 @@
 ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
 ; CHECK: phi i32
 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 define i16 @reduction_i32(i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) {
@@ -63,8 +63,8 @@
 ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
 ; CHECK: phi i32
 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
 entry:
Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -3,13 +3,14 @@
 ; CHECK-LABEL: mul_reduce_add
 ; CHECK: dls lr,
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
 ; CHECK: vpstt
 ; CHECK-NEXT: vldrwt.u32
 ; CHECK-NEXT: vldrwt.u32
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK: vpsel
 ; CHECK: vaddv.u32 r0
 define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
@@ -54,7 +55,17 @@
   ret i32 %res.0.lcssa
 }

-; Function Attrs: norecurse nounwind readonly
+; CHECK-LABEL: mul_reduce_add_const
+; CHECK: dls lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: vpst
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
+; CHECK: le lr, [[LOOP]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
+; CHECK: vpsel
 define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
 entry:
   %cmp6 = icmp eq i32 %N, 0
@@ -96,13 +107,14 @@
 ; CHECK-LABEL: add_reduce_add_const
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: subs [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
 ; CHECK: vpst
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
 ; CHECK: vadd.i32
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK: vpsel
 define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
 entry:
@@ -145,8 +157,8 @@
 ; CHECK-LABEL: vector_mul_const
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: subs [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
 ; CHECK: vpst
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
 ; CHECK: vmul.i32
@@ -192,8 +204,8 @@
 ; CHECK-LABEL: vector_add_const
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: subs [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
 ; CHECK: vpst
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
 ; CHECK: vadd.i32
Index: llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ llvm/trunk/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -6,13 +6,13 @@
 ; CHECK: vector.body:
 ; CHECK-NOT: phi i32 [ 0, %vector.ph ]
 ; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
 ; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],

 ; CHECK: middle.block:
-; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
+; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
 ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
 ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
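
As context for the CHECK-line changes above, here is a minimal, hand-written LLVM IR sketch of the vector.body shape the pass now emits. The fragment is illustrative and not taken from any of the tests; %elems, %remaining, %pred, %addr and %cmp are made-up names standing in for the usual loop plumbing. The vctp intrinsic is fed by the element-count phi itself, and the sub only produces the phi's back-edge value; because that pre-decrement count is also consumed by the vctp cloned into middle.block, it stays live across the decrement, which is why several of the assembly checks above now expect an extra mov of the count register.

  vector.body:
    %elems = phi i32 [ %N, %vector.ph ], [ %remaining, %vector.body ]
    ; predicate computed from the not-yet-decremented element count
    %pred = call <4 x i1> @llvm.arm.vctp32(i32 %elems)
    %wide.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %pred, <4 x i32> undef)
    ; the decrement now follows the vctp and only feeds the back edge
    %remaining = sub i32 %elems, 4
    br i1 %cmp, label %vector.body, label %middle.block

  middle.block:
    ; the cloned predicate likewise uses %elems rather than %remaining
    %pred.final = call <4 x i1> @llvm.arm.vctp32(i32 %elems)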