Changeset View
Changeset View
Standalone View
Standalone View
llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s | ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s | ||||
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=force-enabled \ | |||||
; RUN: -mattr=+mve %s -S -o - | FileCheck %s --check-prefix=FORCE | |||||
; CHECK-LABEL: reduction_i32 | ; CHECK-LABEL: reduction_i32 | ||||
; CHECK: phi i32 [ 0, %vector.ph ] | ; CHECK: phi i32 [ 0, %vector.ph ] | ||||
; CHECK: phi <8 x i16> [ zeroinitializer, %vector.ph ] | ; CHECK: phi <8 x i16> [ zeroinitializer, %vector.ph ] | ||||
; CHECK: phi i32 | ; CHECK: phi i32 | ||||
; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] | ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS:%[^ ]+]], %vector.body ] | ||||
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) | ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) | ||||
; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 | ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 | ||||
▲ Show 20 Lines • Show All 119 Lines • ▼ Show 20 Lines | middle.block: ; preds = %vector.body | ||||
%tmp9 = extractelement <8 x i16> %bin.rdx8, i32 0 | %tmp9 = extractelement <8 x i16> %bin.rdx8, i32 0 | ||||
ret i16 %tmp9 | ret i16 %tmp9 | ||||
for.cond.cleanup: | for.cond.cleanup: | ||||
%res.0 = phi i16 [ 0, %entry ] | %res.0 = phi i16 [ 0, %entry ] | ||||
ret i16 %res.0 | ret i16 %res.0 | ||||
} | } | ||||
; The vector loop is not guarded with an entry check (N == 0). | ; The vector loop is not guarded with an entry check (N == 0). Check that | ||||
; This means we can't calculate a precise range for the backedge count in | ; despite this we can still calculate a precise enough range for the | ||||
; @llvm.get.active.lane.mask, and are assuming overflow can happen and thus | ; backedge count to safely insert a vctp here. | ||||
; we can't insert the VCTP here. | |||||
; | ; | ||||
; CHECK-LABEL: @reduction_not_guarded | ; CHECK-LABEL: @reduction_not_guarded | ||||
; | ; | ||||
; CHECK: vector.body: | ; CHECK: vector.body: | ||||
; CHECK-NOT: @llvm.arm.mve.vctp | ; CHECK: @llvm.arm.mve.vctp | ||||
; CHECK: @llvm.get.active.lane.mask.v8i1.i32 | ; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32 | ||||
; CHECK: ret | ; CHECK: ret | ||||
; | ; | ||||
define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { | define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { | ||||
entry: | entry: | ||||
%tmp = add i32 %N, -1 | %tmp = add i32 %N, -1 | ||||
%n.rnd.up = add nuw nsw i32 %tmp, 8 | %n.rnd.up = add nuw nsw i32 %tmp, 8 | ||||
%n.vec = and i32 %n.rnd.up, -8 | %n.vec = and i32 %n.rnd.up, -8 | ||||
%broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0 | %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %tmp, i32 0 | ||||
Show All 34 Lines | middle.block: ; preds = %vector.body | ||||
%rdx.shuf5 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | %rdx.shuf5 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | ||||
%bin.rdx6 = add <8 x i16> %rdx.shuf5, %bin.rdx | %bin.rdx6 = add <8 x i16> %rdx.shuf5, %bin.rdx | ||||
%rdx.shuf7 = shufflevector <8 x i16> %bin.rdx6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | ||||
%bin.rdx8 = add <8 x i16> %rdx.shuf7, %bin.rdx6 | %bin.rdx8 = add <8 x i16> %rdx.shuf7, %bin.rdx6 | ||||
%tmp9 = extractelement <8 x i16> %bin.rdx8, i32 0 | %tmp9 = extractelement <8 x i16> %bin.rdx8, i32 0 | ||||
ret i16 %tmp9 | ret i16 %tmp9 | ||||
} | } | ||||
; Without forcing tail-predication, we bail because overflow analysis says: | |||||
; | |||||
; overflow possible in: {(-1 + (sext i16 %Size to i32)),+,-1}<nw><%for.body> | |||||
; | |||||
; CHECK-LABEL: @Correlation | ; CHECK-LABEL: @Correlation | ||||
; | |||||
; CHECK: vector.body: | ; CHECK: vector.body: | ||||
; CHECK-NOT: @llvm.arm.mve.vctp | ; CHECK: @llvm.arm.mve.vctp | ||||
; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) | ; CHECK-NOT: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask | ||||
; | |||||
; FORCE-LABEL: @Correlation | |||||
; FORCE: vector.ph: ; preds = %for.body | |||||
; FORCE: %trip.count.minus.1 = add i32 %{{.*}}, -1 | |||||
; FORCE: call void @llvm.set.loop.iterations.i32(i32 %{{.*}}) | |||||
; FORCE: br label %vector.body | |||||
; FORCE: vector.body: ; preds = %vector.body, %vector.ph | |||||
; FORCE: %[[VCTP:.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 %{{.*}}) | |||||
; FORCE: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[VCTP]]{{.*}} | |||||
; | ; | ||||
define dso_local void @Correlation(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { | define dso_local void @Correlation(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr #0 { | ||||
entry: | entry: | ||||
%conv = sext i16 %N to i32 | %conv = sext i16 %N to i32 | ||||
%cmp36 = icmp sgt i16 %N, 0 | %cmp36 = icmp sgt i16 %N, 0 | ||||
br i1 %cmp36, label %for.body.lr.ph, label %for.end17 | br i1 %cmp36, label %for.body.lr.ph, label %for.end17 | ||||
for.body.lr.ph: | for.body.lr.ph: | ||||
▲ Show 20 Lines • Show All 79 Lines • Show Last 20 Lines |