diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr52111.ll b/llvm/test/Transforms/LoopVectorize/X86/pr52111.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr52111.ll
@@ -0,0 +1,143 @@
+; RUN: opt %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+; Test case for PR52111. Make sure that NUW/NSW flags are dropped from
+; instructions in blocks that need predication and are linearized and masked
+; after vectorization. We need AVX512 target features for the loop to be
+; vectorized with masks instead of predicates.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; CHECK-LABEL: define void @pr52111_scalar_nuw_nus
+; CHECK: vector.body:
+; CHECK: %[[vecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %{{.*}} ]
+; CHECK: %[[lane0Idx:.*]] = add i64 %index, 0
+; CHECK: %[[cmp:.*]] = icmp eq <4 x i64> %[[vecInd]], zeroinitializer
+; We shouldn't have NUW/NSW flags in the following sub instruction.
+; CHECK: %[[sub:.*]] = sub i64 %[[lane0Idx]], 1
+; CHECK: %[[g0:.*]] = getelementptr inbounds float, float* %{{.*}}, i64 %[[sub]]
+; CHECK: %[[mask:.*]] = xor <4 x i1> %[[cmp]],
+; CHECK: %[[g1:.*]] = getelementptr inbounds float, float* %[[g0]], i32 0
+; CHECK: %[[bcast:.*]] = bitcast float* %[[g1]] to <4 x float>*
+; CHECK: call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %[[bcast]], i32 4, <4 x i1> %[[mask]], <4 x float> poison)
+define void @pr52111_scalar_nuw_nus(float* noalias nocapture readonly %input,
+                                    float* %output) local_unnamed_addr #0 {
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.inc, %if.end ]
+  %i23 = icmp eq i64 %iv, 0
+  br i1 %i23, label %if.end, label %if.then
+
+if.then:
+  %i27 = sub nuw nsw i64 %iv, 1
+  %i29 = getelementptr inbounds float, float* %input, i64 %i27
+  %i30 = load float, float* %i29, align 4, !invariant.load !0, !noalias !1
+  br label %if.end
+
+if.end:
+  %i34 = phi float [ 0.000000e+00, %loop.header ], [ %i30, %if.then ]
+  %i35 = getelementptr inbounds float, float* %output, i64 %iv
+  store float %i34, float* %i35, align 4, !alias.scope !1
+  %iv.inc = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.inc, 4
+  br i1 %exitcond, label %loop.exit, label %loop.header
+
+loop.exit:
+  ret void
+}
+
+; CHECK-LABEL: define void @pr52111_vector_nuw_nus
+; CHECK: vector.body:
+; CHECK: %[[vecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %{{.*}} ]
+; CHECK: %[[cmp:.*]] = icmp eq <4 x i64> %[[vecInd]], zeroinitializer
+; We shouldn't have NUW/NSW flags in the following sub/mul instructions.
+; CHECK: %[[sub:.*]] = sub <4 x i64> %[[vecInd]],
+; CHECK: %[[mul:.*]] = mul <4 x i64> %[[sub]],
+; CHECK: %[[g0:.*]] = getelementptr inbounds float, float* %{{.*}}, <4 x i64> %[[mul]]
+; CHECK: %[[mask:.*]] = xor <4 x i1> %[[cmp]],
+; CHECK: call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %[[g0]], i32 4, <4 x i1> %[[mask]], <4 x float> undef)
+define void @pr52111_vector_nuw_nus(float* noalias nocapture readonly %input,
+                                    float* %output) local_unnamed_addr #0 {
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.inc, %if.end ]
+  %i23 = icmp eq i64 %iv, 0
+  br i1 %i23, label %if.end, label %if.then
+
+if.then:
+  %i27 = sub nuw nsw i64 %iv, 1
+  %i28 = mul nuw nsw i64 %i27, 2
+  %i29 = getelementptr inbounds float, float* %input, i64 %i28
+  %i30 = load float, float* %i29, align 4, !invariant.load !0, !noalias !1
+  br label %if.end
+
+if.end:
+  %i34 = phi float [ 0.000000e+00, %loop.header ], [ %i30, %if.then ]
+  %i35 = getelementptr inbounds float, float* %output, i64 %iv
+  store float %i34, float* %i35, align 4, !alias.scope !1
+  %iv.inc = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.inc, 4
+  br i1 %exitcond, label %loop.exit, label %loop.header
+
+loop.exit:
+  ret void
+}
+
+; CHECK-LABEL: define void @pr52111_vector_exact
+; CHECK: vector.body:
+; CHECK: %[[vecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %{{.*}} ]
+; CHECK: %[[lane0Idx:.*]] = add i64 %index, 0
+; CHECK: %[[cmp0:.*]] = icmp ne <4 x i64> %[[vecInd]], zeroinitializer
+; CHECK: %[[and0:.*]] = and <4 x i64> %[[vecInd]],
+; CHECK: %[[cmp1:.*]] = icmp eq <4 x i64> %[[and0]], zeroinitializer
+; CHECK: %[[and1:.*]] = and <4 x i1> %[[cmp0]], %[[cmp1]]
+; We shouldn't have the 'exact' flag in the following div instruction.
+; CHECK: %[[div:.*]] = sdiv <4 x i64> %[[vecInd]],
+; CHECK: %[[g0:.*]] = getelementptr inbounds float, float* %{{.*}}, <4 x i64> %[[div]]
+; CHECK: %[[mask:.*]] = xor <4 x i1> %[[and1]],
+; CHECK: call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %[[g0]], i32 4, <4 x i1> %[[mask]], <4 x float> undef)
+define void @pr52111_vector_exact(float* noalias nocapture readonly %input,
+                                  float* %output) local_unnamed_addr #0 {
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.inc, %if.end ]
+  %i7 = icmp ne i64 %iv, 0
+  %i8 = and i64 %iv, 1
+  %i9 = icmp eq i64 %i8, 0
+  %i10 = and i1 %i7, %i9
+  br i1 %i10, label %if.end, label %if.then
+
+if.then:
+  %i26 = sdiv exact i64 %iv, 2
+  %i29 = getelementptr inbounds float, float* %input, i64 %i26
+  %i30 = load float, float* %i29, align 4, !invariant.load !0, !noalias !1
+  br label %if.end
+
+if.end:
+  %i34 = phi float [ 0.000000e+00, %loop.header ], [ %i30, %if.then ]
+  %i35 = getelementptr inbounds float, float* %output, i64 %iv
+  store float %i34, float* %i35, align 4, !alias.scope !1
+  %iv.inc = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.inc, 4
+  br i1 %exitcond, label %loop.exit, label %loop.header
+
+loop.exit:
+  ret void
+}
+
+attributes #0 = { noinline nounwind uwtable "target-features"="+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl" }
+
+!0 = !{}
+!1 = !{!2}
+!2 = !{!"buffer: {index:0, offset:0, size:38720}", !3}
+!3 = !{!"Global AA domain"}
+!4 = distinct !{!4, !5, !6}
+!5 = !{!"llvm.loop.vectorize.width", i32 4}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}