Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -1,5 +1,7 @@ ; RUN: opt < %s -loop-vectorize -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve,+bf16 -S 2>%t | FileCheck %s -check-prefix=CHECK ; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARK +; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize \ +; RUN: -pass-remarks-missed=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve,+bf16 -S | FileCheck %s --check-prefix=CHECK-TF ; Reduction can be vectorized @@ -16,6 +18,22 @@ ; CHECK: middle.block: ; CHECK: %[[ADD:.*]] = add %[[ADD2]], %[[ADD1]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32( %[[ADD]]) + +; CHECK-TF-LABEL: @add +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[ADD1:.*]] = add %[[LOAD1]] +; CHECK-TF: %[[ADD2:.*]] = add %[[LOAD2]] +; CHECK-TF: %[[SEL_ADD1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[ADD1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_ADD2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[ADD2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[ADD:.*]] = add %[[SEL_ADD2]], %[[SEL_ADD1]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32( %[[ADD]]) entry: br label %for.body @@ -46,6 +64,22 @@ ; CHECK: middle.block: ; CHECK: %[[OR:.*]] = or %[[OR2]], %[[OR1]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32( %[[OR]]) + +; CHECK-TF-LABEL: @or +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[OR1:.*]] = or %[[LOAD1]] +; CHECK-TF: %[[OR2:.*]] = or %[[LOAD2]] +; CHECK-TF: %[[SEL_OR1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[OR1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_OR2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[OR2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[OR:.*]] = or %[[SEL_OR2]], %[[SEL_OR1]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32( %[[OR]]) entry: br label %for.body @@ -76,6 +110,22 @@ ; CHECK: middle.block: ; CHECK: %[[ABD:.*]] = and %[[ADD2]], %[[AND1]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32( %[[ADD]]) + +; CHECK-TF-LABEL: @and +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[AND1:.*]] = and %[[LOAD1]] +; CHECK-TF: %[[AND2:.*]] = and %[[LOAD2]] +; CHECK-TF: %[[SEL_AND1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[AND1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_AND2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[AND2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; 
CHECK-TF: %[[AND:.*]] = and %[[SEL_AND2]], %[[SEL_AND1]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32( %[[AND]]) entry: br label %for.body @@ -106,6 +156,22 @@ ; CHECK: middle.block: ; CHECK: %[[XOR:.*]] = xor %[[XOR2]], %[[XOR1]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32( %[[XOR]]) + +; CHECK-TF-LABEL: @xor +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[XOR1:.*]] = xor %[[LOAD1]] +; CHECK-TF: %[[XOR2:.*]] = xor %[[LOAD2]] +; CHECK-TF: %[[SEL_XOR1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[XOR1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_XOR2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[XOR2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[XOR:.*]] = xor %[[SEL_XOR2]], %[[SEL_XOR1]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32( %[[XOR]]) entry: br label %for.body @@ -139,6 +205,25 @@ ; CHECK: %[[ICMP:.*]] = icmp slt %[[SEL1]], %[[SEL2]] ; CHECK-NEXT: %[[SEL:.*]] = select %[[ICMP]], %[[SEL1]], %[[SEL2]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32( %[[SEL]]) + +; CHECK-TF-LABEL: @smin +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[ICMP1:.*]] = icmp slt %[[LOAD1]] +; CHECK-TF: %[[ICMP2:.*]] = icmp slt %[[LOAD2]] +; CHECK-TF: %[[SEL1:.*]] = select %[[ICMP1]], %[[LOAD1]] +; CHECK-TF: %[[SEL2:.*]] = select %[[ICMP2]], %[[LOAD2]] +; CHECK-TF: %[[SEL_SEL1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[SEL1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_SEL2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[SEL2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[ICMP:.*]] = icmp slt %[[SEL_SEL1]], %[[SEL_SEL2]] +; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ICMP]], %[[SEL_SEL1]], %[[SEL_SEL2]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32( %[[SEL]]) entry: br label %for.body @@ -173,6 +258,25 @@ ; CHECK: %[[ICMP:.*]] = icmp ugt %[[SEL1]], %[[SEL2]] ; CHECK-NEXT: %[[SEL:.*]] = select %[[ICMP]], %[[SEL1]], %[[SEL2]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32( %[[SEL]]) + +; CHECK-TF-LABEL: @umax +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[ICMP1:.*]] = icmp ugt %[[LOAD1]] +; CHECK-TF: %[[ICMP2:.*]] = icmp ugt %[[LOAD2]] +; CHECK-TF: %[[SEL1:.*]] = select %[[ICMP1]], %[[LOAD1]] +; CHECK-TF: %[[SEL2:.*]] = select %[[ICMP2]], %[[LOAD2]] +; CHECK-TF: %[[SEL_SEL1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[SEL1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_SEL2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[SEL2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[ICMP:.*]] = icmp ugt %[[SEL_SEL1]], %[[SEL_SEL2]] +; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ICMP]], %[[SEL_SEL1]], %[[SEL_SEL2]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32( %[[SEL]]) entry: br label %for.body @@ -204,6 +308,22 @@ ; CHECK: middle.block: ; CHECK: 
%[[ADD:.*]] = fadd fast %[[ADD2]], %[[ADD1]]
 ; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, %[[ADD]])
+
+; CHECK-TF-LABEL: @fadd_fast
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi
+; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi
+; CHECK-TF: %[[VEC_PHI1:.*]] = phi
+; CHECK-TF: %[[VEC_PHI2:.*]] = phi
+; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[ADD1:.*]] = fadd fast %[[LOAD1]]
+; CHECK-TF: %[[ADD2:.*]] = fadd fast %[[LOAD2]]
+; CHECK-TF: %[[SEL_ADD1:.*]] = select fast %[[ACTIVE_LANE_MASK1]], %[[ADD1]], %[[VEC_PHI1]]
+; CHECK-TF: %[[SEL_ADD2:.*]] = select fast %[[ACTIVE_LANE_MASK2]], %[[ADD2]], %[[VEC_PHI2]]
+; CHECK-TF: middle.block:
+; CHECK-TF: %[[ADD:.*]] = fadd fast %[[SEL_ADD2]], %[[SEL_ADD1]]
+; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, %[[ADD]])
 entry:
   br label %for.body
@@ -233,6 +353,22 @@
 ; CHECK: middle.block:
 ; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]]
 ; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]])
+
+; CHECK-TF-LABEL: @fadd_fast_bfloat
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi <8 x i1>
+; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi <8 x i1>
+; CHECK-TF: %[[VEC_PHI1:.*]] = phi <8 x bfloat>
+; CHECK-TF: %[[VEC_PHI2:.*]] = phi <8 x bfloat>
+; CHECK-TF: %[[LOAD1:.*]] = call <8 x bfloat> @llvm.masked.load.v8bf16
+; CHECK-TF: %[[LOAD2:.*]] = call <8 x bfloat> @llvm.masked.load.v8bf16
+; CHECK-TF: %[[ADD1:.*]] = fadd fast <8 x bfloat> %[[LOAD1]]
+; CHECK-TF: %[[ADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]]
+; CHECK-TF: %[[SEL_ADD1:.*]] = select fast <8 x i1> %[[ACTIVE_LANE_MASK1]], <8 x bfloat> %[[ADD1]], <8 x bfloat> %[[VEC_PHI1]]
+; CHECK-TF: %[[SEL_ADD2:.*]] = select fast <8 x i1> %[[ACTIVE_LANE_MASK2]], <8 x bfloat> %[[ADD2]], <8 x bfloat> %[[VEC_PHI2]]
+; CHECK-TF: middle.block:
+; CHECK-TF: %[[ADD:.*]] = fadd fast <8 x bfloat> %[[SEL_ADD2]], %[[SEL_ADD1]]
+; CHECK-TF-NEXT: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[ADD]])
 entry:
   br label %for.body
@@ -258,14 +394,33 @@
 ; CHECK: vector.body:
 ; CHECK: %[[LOAD1:.*]] = load
 ; CHECK: %[[LOAD2:.*]] = load
-; CHECK: %[[FCMP1:.*]] = fcmp olt %[[LOAD1]]
-; CHECK: %[[FCMP2:.*]] = fcmp olt %[[LOAD2]]
+; CHECK: %[[FCMP1:.*]] = fcmp fast olt %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp fast olt %[[LOAD2]]
 ; CHECK: %[[SEL1:.*]] = select %[[FCMP1]], %[[LOAD1]]
 ; CHECK: %[[SEL2:.*]] = select %[[FCMP2]], %[[LOAD2]]
 ; CHECK: middle.block:
-; CHECK: %[[FCMP:.*]] = fcmp olt %[[SEL1]], %[[SEL2]]
-; CHECK-NEXT: %[[SEL:.*]] = select %[[FCMP]], %[[SEL1]], %[[SEL2]]
-; CHECK-NEXT: call float @llvm.vector.reduce.fmin.nxv8f32( %[[SEL]])
+; CHECK: %[[FCMP:.*]] = fcmp fast olt %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select fast %[[FCMP]], %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: call fast float @llvm.vector.reduce.fmin.nxv8f32( %[[SEL]])
+
+; CHECK-TF-LABEL: @fmin_fast
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi
+; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi
+; CHECK-TF: %[[VEC_PHI1:.*]] = phi
+; CHECK-TF: %[[VEC_PHI2:.*]] = phi
+; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[FCMP1:.*]] = fcmp fast olt %[[LOAD1]]
+; CHECK-TF: %[[FCMP2:.*]] = fcmp fast olt %[[LOAD2]]
+; CHECK-TF: %[[SEL1:.*]] = select %[[FCMP1]], %[[LOAD1]]
+; CHECK-TF: %[[SEL2:.*]] = select %[[FCMP2]], %[[LOAD2]]
+; CHECK-TF: %[[SEL_SEL1:.*]] = select fast %[[ACTIVE_LANE_MASK1]], %[[SEL1]], %[[VEC_PHI1]]
+; CHECK-TF: %[[SEL_SEL2:.*]] = select fast %[[ACTIVE_LANE_MASK2]], %[[SEL2]], %[[VEC_PHI2]]
+; CHECK-TF: middle.block:
+; CHECK-TF: %[[FCMP:.*]] = fcmp fast olt %[[SEL_SEL1]], %[[SEL_SEL2]]
+; CHECK-TF-NEXT: %[[SEL:.*]] = select fast %[[FCMP]], %[[SEL_SEL1]], %[[SEL_SEL2]]
+; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fmin.nxv8f32( %[[SEL]])
 entry:
   br label %for.body
@@ -274,7 +429,7 @@
   %sum.07 = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
   %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
   %0 = load float, float* %arrayidx, align 4
-  %cmp.i = fcmp olt float %0, %sum.07
+  %cmp.i = fcmp fast olt float %0, %sum.07
   %.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %n
@@ -300,6 +455,25 @@
 ; CHECK: %[[FCMP:.*]] = fcmp fast ogt %[[SEL1]], %[[SEL2]]
 ; CHECK-NEXT: %[[SEL:.*]] = select fast %[[FCMP]], %[[SEL1]], %[[SEL2]]
 ; CHECK-NEXT: call fast float @llvm.vector.reduce.fmax.nxv8f32( %[[SEL]])
+
+; CHECK-TF-LABEL: @fmax_fast
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi
+; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi
+; CHECK-TF: %[[VEC_PHI1:.*]] = phi
+; CHECK-TF: %[[VEC_PHI2:.*]] = phi
+; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[FCMP1:.*]] = fcmp fast ogt %[[LOAD1]]
+; CHECK-TF: %[[FCMP2:.*]] = fcmp fast ogt %[[LOAD2]]
+; CHECK-TF: %[[SEL1:.*]] = select %[[FCMP1]], %[[LOAD1]]
+; CHECK-TF: %[[SEL2:.*]] = select %[[FCMP2]], %[[LOAD2]]
+; CHECK-TF: %[[SEL_SEL1:.*]] = select fast %[[ACTIVE_LANE_MASK1]], %[[SEL1]], %[[VEC_PHI1]]
+; CHECK-TF: %[[SEL_SEL2:.*]] = select fast %[[ACTIVE_LANE_MASK2]], %[[SEL2]], %[[VEC_PHI2]]
+; CHECK-TF: middle.block:
+; CHECK-TF: %[[FCMP:.*]] = fcmp fast ogt %[[SEL_SEL1]], %[[SEL_SEL2]]
+; CHECK-TF-NEXT: %[[SEL:.*]] = select fast %[[FCMP]], %[[SEL_SEL1]], %[[SEL_SEL2]]
+; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fmax.nxv8f32( %[[SEL]])
 entry:
   br label %for.body
@@ -332,6 +506,17 @@
 ; CHECK: %[[ADD:.*]] = add %[[ADD2]], %[[ADD1]]
 ; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( %[[ADD]])
 ; CHECK-NEXT: store i32 %[[SUM]], i32* %gep.dst, align 4
+
+; CHECK-TF-LABEL: @invariant_store
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi
+; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv4i32
+; CHECK-TF: %[[ADD1:.*]] = add %{{.*}}, %[[LOAD1]]
+; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[ADD1]],
+; CHECK-TF: middle.block:
+; CHECK-TF-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( %[[SEL]])
+; CHECK-TF-NEXT: store i32 %[[SUM]], i32* %gep.dst, align 4
+
 entry:
   %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
   store i32 0, i32* %gep.dst, align 4
@@ -367,6 +552,22 @@
 ; CHECK: middle.block:
 ; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
 ; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+
+; CHECK-TF-LABEL: @mul
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi <2 x i1>
+; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi <2 x i1>
+; CHECK-TF: %[[VEC_PHI1:.*]] = phi <2 x i32>
+; CHECK-TF: %[[VEC_PHI2:.*]] = phi <2 x i32>
+; CHECK-TF: %[[LOAD1:.*]] = call <2 x i32> @llvm.masked.load.v2i32
+; CHECK-TF: %[[LOAD2:.*]] = call <2 x i32>
@llvm.masked.load.v2i32 +; CHECK-TF: %[[MUL1:.*]] = mul <2 x i32> %[[LOAD1]] +; CHECK-TF: %[[MUL2:.*]] = mul <2 x i32> %[[LOAD2]] +; CHECK-TF: %[[SEL_MUL1:.*]] = select <2 x i1> %[[ACTIVE_LANE_MASK1]], <2 x i32> %[[MUL1]], <2 x i32> %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_MUL2:.*]] = select <2 x i1> %[[ACTIVE_LANE_MASK2]], <2 x i32> %[[MUL2]], <2 x i32> %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[RDX:.*]] = mul <2 x i32> %[[SEL_MUL2]], %[[SEL_MUL1]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %[[RDX]]) entry: br label %for.body @@ -401,6 +602,26 @@ ; CHECK: middle.block: ; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] ; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) + +; CHECK-TF-LABEL: @memory_dependence +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi <2 x i1> +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi <2 x i1> +; CHECK-TF: %[[VEC_PHI1:.*]] = phi <2 x i32> +; CHECK-TF: %[[VEC_PHI2:.*]] = phi <2 x i32> +; CHECK-TF: %[[LOAD1:.*]] = call <2 x i32> @llvm.masked.load.v2i32 +; CHECK-TF: %[[LOAD2:.*]] = call <2 x i32> @llvm.masked.load.v2i32 +; CHECK-TF: %[[LOAD3:.*]] = call <2 x i32> @llvm.masked.load.v2i32 +; CHECK-TF: %[[LOAD4:.*]] = call <2 x i32> @llvm.masked.load.v2i32 +; CHECK-TF: %[[ADD1:.*]] = add nsw <2 x i32> %[[LOAD3]], %[[LOAD1]] +; CHECK-TF: %[[ADD2:.*]] = add nsw <2 x i32> %[[LOAD4]], %[[LOAD2]] +; CHECK-TF: %[[MUL1:.*]] = mul <2 x i32> %[[LOAD3]] +; CHECK-TF: %[[MUL2:.*]] = mul <2 x i32> %[[LOAD4]] +; CHECK-TF: %[[SEL_MUL1:.*]] = select <2 x i1> %[[ACTIVE_LANE_MASK1]], <2 x i32> %[[MUL1]], <2 x i32> %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_MUL2:.*]] = select <2 x i1> %[[ACTIVE_LANE_MASK2]], <2 x i32> %[[MUL2]], <2 x i32> %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[RDX:.*]] = mul <2 x i32> %[[SEL_MUL2]], %[[SEL_MUL1]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %[[RDX]]) entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll @@ -1,4 +1,6 @@ ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve < %s -S | FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s -S | FileCheck %s --check-prefix=CHECK-TF target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -13,6 +15,14 @@ ; CHECK-NEXT: [[TMP2:%.*]] = select [[TMP1]], shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i32 10, i32 0), poison, zeroinitializer) ; CHECK: store [[TMP2]], * {{.*}}, align 4 ; +; CHECK-TF-LABEL: @cmpsel_i32( +; CHECK-TF-NEXT: entry: +; CHECK-TF: vector.body: +; CHECK-TF: [[WIDE_LOAD:%.*]] = call @llvm.masked.load.nxv4i32 +; CHECK-TF-NEXT: [[TMP1:%.*]] = icmp eq [[WIDE_LOAD]], zeroinitializer +; CHECK-TF-NEXT: [[TMP2:%.*]] = select [[TMP1]], shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i32 10, i32 0), poison, zeroinitializer) +; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP2]] +; entry: br label %for.body @@ -44,6 +54,14 @@ ; CHECK-NEXT: [[TMP2:%.*]] = select [[TMP1]], shufflevector ( insertelement ( poison, float 1.000000e+01, i32 0), 
poison, zeroinitializer), shufflevector ( insertelement ( poison, float 2.000000e+00, i32 0), poison, zeroinitializer) ; CHECK: store [[TMP2]], * {{.*}}, align 4 +; CHECK-TF-LABEL: @cmpsel_f32( +; CHECK-TF-NEXT: entry: +; CHECK-TF: vector.body: +; CHECK-TF: [[WIDE_LOAD:%.*]] = call @llvm.masked.load.nxv4f32 +; CHECK-TF-NEXT: [[TMP1:%.*]] = fcmp ogt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP2:%.*]] = select [[TMP1]], shufflevector ( insertelement ( poison, float 1.000000e+01, i32 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, float 2.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-TF: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[TMP2]] +; entry: br label %for.body @@ -71,6 +89,12 @@ ; CHECK-NEXT: [[TMP1:%.*]] = fneg [[WIDE_LOAD]] ; CHECK: store [[TMP1]], * {{.*}}, align 4 +; CHECK-TF-LABEL: @fneg_f32( +; CHECK-TF-NEXT: entry: +; CHECK-TF: vector.body: +; CHECK-TF: [[WIDE_LOAD:%.*]] = call @llvm.masked.load.nxv4f32 +; CHECK-TF-NEXT: [[TMP1:%.*]] = fneg [[WIDE_LOAD]] +; CHECK-TF: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[TMP1]] entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S %s -o - | FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s -o - | FileCheck %s --check-prefix=CHECK-TF define void @cond_inv_load_i32i32i16(i32* noalias nocapture %a, i32* noalias nocapture readonly %cond, i16* noalias nocapture readonly %inv, i64 %n) #0 { ; CHECK-LABEL: @cond_inv_load_i32i32i16( @@ -57,6 +59,69 @@ ; CHECK: exit: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @cond_inv_load_i32i32i16( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16* [[INV:%.*]], i64 0 +; CHECK-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-TF-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( [[BROADCAST_SPLAT]], i32 2, [[TMP12]], undef) +; CHECK-TF-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_GATHER]] to +; CHECK-TF-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP13]], * [[TMP15]], i32 4, [[TMP12]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP17]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[I_07]] +; CHECK-TF-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP19]], 0 +; CHECK-TF-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK-TF: if.then: +; CHECK-TF-NEXT: [[TMP20:%.*]] = load i16, i16* [[INV]], align 2 +; CHECK-TF-NEXT: [[CONV:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-TF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_07]] +; CHECK-TF-NEXT: store i32 [[CONV]], i32* [[ARRAYIDX1]], align 4 +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-TF: exit: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -137,6 +202,67 @@ ; CHECK: exit: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @cond_inv_load_f64f64f64( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; 
CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double* [[INV:%.*]], i64 0 +; CHECK-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[COND:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f64.p0nxv4f64(* [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = fcmp ogt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, double 4.000000e-01, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f64.nxv4p0f64( [[BROADCAST_SPLAT]], i32 8, [[TMP12]], undef) +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4f64.p0nxv4f64( [[WIDE_MASKED_GATHER]], * [[TMP14]], i32 8, [[TMP12]]) +; CHECK-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[COND]], i64 [[I_08]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-TF-NEXT: [[CMP1:%.*]] = fcmp ogt double [[TMP18]], 4.000000e-01 +; CHECK-TF-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-TF: if.then: +; CHECK-TF-NEXT: [[TMP19:%.*]] = load double, double* [[INV]], align 8 +; CHECK-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]] +; CHECK-TF-NEXT: store double [[TMP19]], double* [[ARRAYIDX2]], align 8 +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; 
CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-TF: exit: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -225,6 +351,76 @@ ; CHECK: for.end: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @invariant_load_cond( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 42 +; CHECK-TF-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32* [[TMP9]], i64 0 +; CHECK-TF-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP12:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[B]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer +; CHECK-TF-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP13]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP15]], i32 4, [[TMP14]], poison) +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[DOTSPLAT]], i32 4, [[TMP14]], undef) +; CHECK-TF-NEXT: [[TMP16:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD1]] +; CHECK-TF-NEXT: [[TMP17:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP16]], * [[TMP18]], i32 4, [[TMP14]]) +; CHECK-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP20:%.*]] = shl nuw nsw i64 [[TMP19]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement 
[[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]] +; CHECK-TF-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-TF-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP22]], 0 +; CHECK-TF-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK-TF: if.then: +; CHECK-TF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 42 +; CHECK-TF-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]] +; CHECK-TF-NEXT: [[TMP23:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-TF-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-TF-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP24]], [[TMP23]] +; CHECK-TF-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] +; CHECK-TF-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; CHECK-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-TF: for.end: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -mattr=+sve < %s -S | FileCheck %s +; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s -S | FileCheck %s --check-prefix=CHECK-TF target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" @@ -77,6 +79,73 @@ ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; +; CHECK-TF-LABEL: @fneg( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[S2:%.*]] = ptrtoint ptr [[S:%.*]] to i64 +; CHECK-TF-NEXT: [[D1:%.*]] = ptrtoint ptr [[D:%.*]] to i64 +; CHECK-TF-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-TF-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-TF: for.body.preheader: +; CHECK-TF-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-TF-NEXT: [[TMP0:%.*]] = sub i64 -1, [[WIDE_TRIP_COUNT]] +; CHECK-TF-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-TF-NEXT: br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-TF: vector.memcheck: +; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP3:%.*]] = mul 
i64 [[TMP2]], 8 +; CHECK-TF-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; CHECK-TF-NEXT: [[TMP5:%.*]] = sub i64 [[D1]], [[S2]] +; CHECK-TF-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]] +; CHECK-TF-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 +; CHECK-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8 +; CHECK-TF-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], 1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP10]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[TMP11]] +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds half, ptr [[TMP12]], i32 0 +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f16.p0(ptr [[TMP13]], i32 2, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP14:%.*]] = fneg [[WIDE_MASKED_LOAD]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[TMP11]] +; CHECK-TF-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, ptr [[TMP15]], i32 0 +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv8f16.p0( [[TMP14]], ptr [[TMP16]], i32 2, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 8 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP18]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-TF-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 +; CHECK-TF-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.cond.cleanup.loopexit: +; CHECK-TF-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP21:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; CHECK-TF-NEXT: [[FNEG:%.*]] = fneg half [[TMP21]] +; CHECK-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[INDVARS_IV]] +; 
CHECK-TF-NEXT: store half [[FNEG]], ptr [[ARRAYIDX2]], align 2 +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; entry: %cmp6 = icmp sgt i32 %n, 0 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll @@ -1,10 +1,14 @@ ; REQUIRES: asserts ; RUN: opt -loop-vectorize -mcpu=neoverse-v1 -disable-output %s -debug 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -mcpu=neoverse-v1 -disable-output %s -debug \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue 2>&1 | FileCheck %s --check-prefix=CHECK-TF target triple="aarch64--linux-gnu" ; CHECK: LV: Checking a loop in 'gather_nxv4i32_loaded_index' ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %1 = load float, float* %arrayidx3, align 4 +; CHECK-TF: LV: Checking a loop in 'gather_nxv4i32_loaded_index' +; CHECK-TF: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %1 = load float, float* %arrayidx3, align 4 define void @gather_nxv4i32_loaded_index(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 { entry: br label %for.body @@ -27,6 +31,8 @@ ; CHECK: LV: Checking a loop in 'scatter_nxv4i32_loaded_index' ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %1, float* %arrayidx5, align 4 +; CHECK-TF: LV: Checking a loop in 'scatter_nxv4i32_loaded_index' +; CHECK-TF: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %1, float* %arrayidx5, align 4 define void @scatter_nxv4i32_loaded_index(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 { entry: br label %for.body @@ -51,6 +57,8 @@ ; to ensure the stride value is always 1. Therefore, it can assume a contiguous load and a cost of 1. ; CHECK: LV: Checking a loop in 'gather_nxv4i32_unknown_stride' ; CHECK: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 +; CHECK-TF: LV: Checking a loop in 'gather_nxv4i32_unknown_stride' +; CHECK-TF: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 define void @gather_nxv4i32_unknown_stride(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %stride, i64 %n) #0 { entry: br label %for.body @@ -74,6 +82,8 @@ ; to ensure the stride value is always 1. Therefore, it can assume a contiguous load and cost is 1. 
; CHECK: LV: Checking a loop in 'scatter_nxv4i32_unknown_stride' ; CHECK: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: store float %0, float* %arrayidx2, align 4 +; CHECK-TF: LV: Checking a loop in 'scatter_nxv4i32_unknown_stride' +; CHECK-TF: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: store float %0, float* %arrayidx2, align 4 define void @scatter_nxv4i32_unknown_stride(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %stride, i64 %n) #0 { entry: br label %for.body @@ -95,6 +105,8 @@ ; CHECK: LV: Checking a loop in 'gather_nxv4i32_stride2' ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 +; CHECK-TF: LV: Checking a loop in 'gather_nxv4i32_stride2' +; CHECK-TF: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 define void @gather_nxv4i32_stride2(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { entry: br label %for.body @@ -138,6 +150,8 @@ ; CHECK: LV: Checking a loop in 'gather_nxv4i32_stride64' ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 +; CHECK-TF: LV: Checking a loop in 'gather_nxv4i32_stride64' +; CHECK-TF: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 define void @gather_nxv4i32_stride64(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { entry: br label %for.body @@ -159,6 +173,8 @@ ; CHECK: LV: Checking a loop in 'scatter_nxv4i32_stride64' ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %0, float* %arrayidx2, align 4 +; CHECK-TF: LV: Checking a loop in 'scatter_nxv4i32_stride64' +; CHECK-TF: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %0, float* %arrayidx2, align 4 define void @scatter_nxv4i32_stride64(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S %s -force-target-instruction-cost=1 -o - | FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s \ +; RUN: -force-target-instruction-cost=1 -o - | FileCheck %s --check-prefix=CHECK-TF define void @gather_nxv4i32_ind64(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 { ; CHECK-LABEL: @gather_nxv4i32_ind64( @@ -49,6 +52,60 @@ ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @gather_nxv4i32_ind64( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label 
[[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i64.p0nxv4i64(* [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], undef) +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[WIDE_MASKED_GATHER]], * [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP17:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 +; CHECK-TF-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; CHECK-TF-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: store float [[TMP18]], float* [[ARRAYIDX5]], align 4 +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-TF: for.cond.cleanup: +; 
CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -121,6 +178,62 @@ ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @scatter_nxv4i32_ind32( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD1]] to +; CHECK-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[A:%.*]], [[TMP13]] +; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( [[WIDE_MASKED_LOAD]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 
[[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-TF-NEXT: [[IDXPROM4:%.*]] = sext i32 [[TMP19]] to i64 +; CHECK-TF-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IDXPROM4]] +; CHECK-TF-NEXT: store float [[TMP18]], float* [[ARRAYIDX5]], align 4 +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -190,6 +303,62 @@ ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @scatter_inv_nxv4i32( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[INV:%.*]], i64 0 +; CHECK-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-TF-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), [[BROADCAST_SPLAT]], i32 4, [[TMP12]]) +; CHECK-TF-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP15]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], 
label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP16]], 0 +; CHECK-TF-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK-TF: if.then: +; CHECK-TF-NEXT: store i32 3, i32* [[INV]], align 4 +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -265,6 +434,65 @@ ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @gather_inv_nxv4i32( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[INV:%.*]], i64 0 +; CHECK-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], i32 4, [[TMP12]], undef) +; CHECK-TF-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP9]] to * +; CHECK-TF-NEXT: call void 
@llvm.masked.store.nxv4i32.p0nxv4i32( [[WIDE_MASKED_GATHER]], * [[TMP13]], i32 4, [[TMP12]]) +; CHECK-TF-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[TMP17]], 3 +; CHECK-TF-NEXT: br i1 [[CMP2]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-TF: if.then: +; CHECK-TF-NEXT: [[TMP18:%.*]] = load i32, i32* [[INV]], align 4 +; CHECK-TF-NEXT: store i32 [[TMP18]], i32* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -353,6 +581,64 @@ ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @gather_nxv4i32_ind64_stride2( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-TF-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 +; CHECK-TF-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP12:%.*]] = shl [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[B:%.*]], [[TMP12]] +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], undef) +; CHECK-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[WIDE_MASKED_GATHER]], * [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP17]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[INDVARS_IV_STRIDE2:%.*]] = shl i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_STRIDE2]] +; CHECK-TF-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: store float [[TMP19]], float* [[ARRAYIDX2]], align 4 +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll @@ -1,6 +1,8 @@ ; REQUIRES: asserts ; RUN: opt -loop-vectorize -S < %s -debug 2>%t | FileCheck %s ; RUN: cat %t | FileCheck %s --check-prefix=DEBUG +; RUN: opt -loop-vectorize -S < %s -debug -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG target triple = "aarch64-unknown-linux-gnu" @@ -12,24 +14,24 @@ ; CHECK-LABEL: @induction_i7( ; CHECK: vector.ph: ; CHECK: [[TMP4:%.*]] = call 
@llvm.experimental.stepvector.nxv2i8() -; CHECK: [[TMP5:%.*]] = trunc %4 to -; CHECK-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; CHECK: [[TMP5:%.*]] = trunc [[TMP4]] to +; CHECK: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i7 1, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP11:%.*]] = add [[VEC_IND]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[EXT:%.+]] = zext [[TMP11]] to ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP13]] to * -; CHECK-NEXT: store [[EXT]], * [[TMP14]], align 8 +; CHECK-NEXT: store{{.*}} [[EXT]], * [[TMP14]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], +; CHECK-NEXT: [[INDEX_NEXT]] = add {{.*}}i64 [[INDEX]], [[TMP16]] +; CHECK: [[VEC_IND_NEXT]] = add [[VEC_IND]], ; entry: br label %for.body @@ -59,23 +61,23 @@ ; CHECK-LABEL: @induction_i3_zext( ; CHECK: vector.ph: ; CHECK: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i8() -; CHECK: [[TMP5:%.*]] = trunc %4 to +; CHECK: [[TMP5:%.*]] = trunc [[TMP4]] to ; CHECK-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i3 1, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = zext [[VEC_IND]] to ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP13]] to * -; CHECK-NEXT: store [[TMP10]], * [[TMP14]], align 8 +; CHECK-NEXT: store{{.*}} [[TMP10]], * [[TMP14]] ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], +; CHECK-NEXT: [[INDEX_NEXT]] = add {{.*}}i64 [[INDEX]], [[TMP16]] +; CHECK: [[VEC_IND_NEXT]] = add [[VEC_IND]], ; entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt 
-loop-vectorize -force-target-instruction-cost=1 -dce -instcombine < %s -S | FileCheck %s +; RUN: opt -loop-vectorize -force-target-instruction-cost=1 -dce -instcombine \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s -S | FileCheck %s --check-prefix=CHECK-TF target triple = "aarch64-linux-gnu" @@ -68,6 +70,71 @@ ; CHECK: exit: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @cond_ind64( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 2 +; CHECK-TF-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 +; CHECK-TF-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP12:%.*]] = trunc [[VEC_IND]] to +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer +; CHECK-TF-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP13]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP15]], i32 4, [[TMP14]], poison) +; CHECK-TF-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[WIDE_MASKED_LOAD]], * [[TMP17]], i32 4, [[TMP14]]) +; CHECK-TF-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP19:%.*]] = shl i64 [[TMP18]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP19]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-TF-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP20]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; 
CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[AND:%.*]] = and i64 [[I_08]], 1 +; CHECK-TF-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i64 [[AND]], 0 +; CHECK-TF-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK-TF: if.then: +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_08]] +; CHECK-TF-NEXT: [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_08]] +; CHECK-TF-NEXT: store i32 [[TMP21]], i32* [[ARRAYIDX1]], align 4 +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-TF: exit: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll @@ -1,4 +1,6 @@ ; RUN: opt -S -loop-vectorize -mattr=+sve -mtriple aarch64-linux-gnu < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -mattr=+sve -mtriple aarch64-linux-gnu \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF define void @invariant_load(i64 %n, i32* noalias nocapture %a, i32* nocapture readonly %b) { ; CHECK-LABEL: @invariant_load @@ -10,6 +12,16 @@ ; CHECK: %[[LOAD:.*]] = load , * ; CHECK-NEXT: %[[ADD:.*]] = add nsw %[[SPLAT]], %[[LOAD]] ; CHECK: store %[[ADD]], * + +; CHECK-TF-LABEL: @invariant_load +; CHECK-TF: vector.body: +; CHECK-TF: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42 +; CHECK-TF-NEXT: %[[INVLOAD:.*]] = load i32, i32* %[[GEP]] +; CHECK-TF-NEXT: %[[SPLATINS:.*]] = insertelement poison, i32 %[[INVLOAD]], i32 0 +; CHECK-TF-NEXT: %[[SPLAT:.*]] = shufflevector %[[SPLATINS]], poison, zeroinitializer +; CHECK-TF: %[[LOAD:.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* +; CHECK-TF-NEXT: %[[ADD:.*]] = add nsw %[[SPLAT]], %[[LOAD]] +; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32( %[[ADD]] entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-vectorize -S < %s | FileCheck %s +; RUN: opt -loop-vectorize -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF target triple = "aarch64-unknown-linux-gnu" @@ -10,6 +11,14 @@ ; CHECK: vector.body: ; CHECK: %[[VECLOAD:.*]] = load , * %{{.*}}, align 2 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %[[VECLOAD]], %[[SPLAT_PTRS]], i32 2 + +; CHECK-TF-LABEL: @inv_store_i16( +; CHECK-TF: vector.ph: +; CHECK-TF: %[[TMP1:.*]] = insertelement poison, i16* %dst, i32 0 +; CHECK-TF-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector %[[TMP1]], poison, zeroinitializer +; CHECK-TF: vector.body: +; CHECK-TF: 
%[[VECLOAD:.*]] = call @llvm.masked.load.nxv4i16.p0nxv4i16( +; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %[[VECLOAD]] entry: br label %for.body14 @@ -36,6 +45,17 @@ ; CHECK: %[[VECLOAD:.*]] = load , * %{{.*}}, align 4 ; CHECK-NEXT: %[[MASK:.*]] = icmp sgt %[[VECLOAD]], zeroinitializer ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[VECLOAD]], %[[SPLAT_PTRS]], i32 4, %[[MASK]]) + +; CHECK-TF-LABEL: @cond_inv_store_i32( +; CHECK-TF: vector.ph: +; CHECK-TF: %[[TMP1:.*]] = insertelement poison, i32* %dst, i32 0 +; CHECK-TF-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector %[[TMP1]], poison, zeroinitializer +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi +; CHECK-TF: %[[VECLOAD:.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32( +; CHECK-TF-NEXT: %[[MASK:.*]] = icmp sgt %[[VECLOAD]], zeroinitializer +; CHECK-TF: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MASK]], zeroinitializer +; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[VECLOAD]], %[[SPLAT_PTRS]], i32 4, %[[SEL]]) entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll @@ -1,4 +1,6 @@ ; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine -S <%s | FileCheck %s +; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine -S \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue <%s | FileCheck %s --check-prefix=CHECK-TF define void @stride7_i32(i32* noalias nocapture %dst, i64 %n) #0 { ; CHECK-LABEL: @stride7_i32( @@ -9,6 +11,15 @@ ; CHECK-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( %[[PTRS]] ; CHECK-NEXT: %[[VALS:.*]] = add nsw %[[GLOAD]], ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[VALS]], %[[PTRS]] + +; CHECK-TF-LABEL: @stride7_i32( +; CHECK-TF: vector.body +; CHECK-TF: %[[VEC_IND:.*]] = phi [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] +; CHECK-TF-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw %[[VEC_IND]], shufflevector ( insertelement ( poison, i64 7, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: %[[PTRS:.*]] = getelementptr inbounds i32, i32* %dst, %[[PTR_INDICES]] +; CHECK-TF-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( %[[PTRS]] +; CHECK-TF-NEXT: %[[VALS:.*]] = add nsw %[[GLOAD]], +; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[VALS]], %[[PTRS]] entry: br label %for.body @@ -36,6 +47,15 @@ ; CHECK-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0f64( %[[PTRS]], ; CHECK-NEXT: %[[VALS:.*]] = fadd %[[GLOAD]], ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( %[[VALS]], %[[PTRS]], + +; CHECK-TF-LABEL: @stride7_f64( +; CHECK-TF: vector.body +; CHECK-TF: %[[VEC_IND:.*]] = phi [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] +; CHECK-TF-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw %[[VEC_IND]], shufflevector ( insertelement ( poison, i64 7, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: %[[PTRS:.*]] = getelementptr inbounds double, double* %dst, %[[PTR_INDICES]] +; CHECK-TF-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0f64( %[[PTRS]], +; CHECK-TF-NEXT: %[[VALS:.*]] = fadd %[[GLOAD]], +; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( %[[VALS]], %[[PTRS]], entry: br label %for.body @@ -63,6 +83,16 @@ ; 
CHECK-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0f64( %[[PTRS]], i32 8, %[[MASK]] ; CHECK-NEXT: %[[VALS:.*]] = fadd %[[GLOAD]], ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( %[[VALS]], %[[PTRS]], i32 8, %[[MASK]]) + +; CHECK-TF-LABEL: @cond_stride7_f64( +; CHECK-TF: vector.body +; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi +; CHECK-TF: %[[MASK:.*]] = icmp ne +; CHECK-TF: %[[PTRS:.*]] = getelementptr inbounds double, double* %dst, %{{.*}} +; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MASK]], zeroinitializer +; CHECK-TF-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0f64( %[[PTRS]], i32 8, %[[SEL]] +; CHECK-TF-NEXT: %[[VALS:.*]] = fadd %[[GLOAD]], +; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( %[[VALS]], %[[PTRS]], i32 8, %[[SEL]]) entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll @@ -1,4 +1,6 @@ ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S %s -o - | FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s -o - | FileCheck %s --check-prefix=CHECK-TF define void @mloadstore_f32(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @mloadstore_f32 @@ -11,6 +13,19 @@ ; CHECK-NEXT: %[[FADD:.*]] = fadd %[[LOAD1]], %[[LOAD2]] ; CHECK-NEXT: %[[MSTORE_PTRS:.*]] = bitcast float* %[[GEPA]] to * ; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( %[[FADD]], * %[[MSTORE_PTRS]], i32 4, %[[MASK]]) + +; CHECK-TF-LABEL: @mloadstore_f32 +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv4f32 +; CHECK-TF-NEXT: %[[MASK:.*]] = fcmp ogt %[[LOAD1]], +; CHECK-TF-NEXT: %[[GEPA:.*]] = getelementptr float, float* %a, +; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MASK]], zeroinitializer +; CHECK-TF-NEXT: %[[MLOAD_PTRS:.*]] = bitcast float* %[[GEPA]] to * +; CHECK-TF-NEXT: %[[LOAD2:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* %[[MLOAD_PTRS]], i32 4, %[[SEL]] +; CHECK-TF-NEXT: %[[FADD:.*]] = fadd %[[LOAD1]], %[[LOAD2]] +; CHECK-TF-NEXT: %[[MSTORE_PTRS:.*]] = bitcast float* %[[GEPA]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( %[[FADD]], * %[[MSTORE_PTRS]], i32 4, %[[SEL]]) entry: br label %for.body @@ -48,6 +63,19 @@ ; CHECK-NEXT: %[[FADD:.*]] = add %[[LOAD1]], %[[LOAD2]] ; CHECK-NEXT: %[[MSTORE_PTRS:.*]] = bitcast i32* %[[GEPA]] to * ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( %[[FADD]], * %[[MSTORE_PTRS]], i32 4, %[[MASK]]) + +; CHECK-TF-LABEL: @mloadstore_i32 +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv4i32 +; CHECK-TF-NEXT: %[[MASK:.*]] = icmp ne %[[LOAD1]], +; CHECK-TF-NEXT: %[[GEPA:.*]] = getelementptr i32, i32* %a, +; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MASK]], zeroinitializer +; CHECK-TF-NEXT: %[[MLOAD_PTRS:.*]] = bitcast i32* %[[GEPA]] to * +; CHECK-TF-NEXT: %[[LOAD2:.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* %[[MLOAD_PTRS]], i32 4, %[[SEL]] +; CHECK-TF-NEXT: %[[FADD:.*]] = add %[[LOAD1]], %[[LOAD2]] +; 
CHECK-TF-NEXT: %[[MSTORE_PTRS:.*]] = bitcast i32* %[[GEPA]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( %[[FADD]], * %[[MSTORE_PTRS]], i32 4, %[[SEL]]) entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll @@ -1,5 +1,7 @@ ; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 ; RUN: opt -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 +; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF-VF4IC1 target triple = "aarch64-linux-gnu" @@ -39,6 +41,18 @@ ; CHECK-VF4IC4-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL7]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) ; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) ; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 + +; CHECK-TF-VF4IC1-LABEL: @select_const_i32_from_icmp +; CHECK-TF-VF4IC1: vector.body: +; CHECK-TF-VF4IC1: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-TF-VF4IC1: [[VEC_LOAD:%.*]] = call @llvm.masked.load.nxv4i32 +; CHECK-TF-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-TF-VF4IC1-NEXT: [[VEC_SEL1:%.*]] = select [[VEC_ICMP]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer) +; CHECK-TF-VF4IC1-NEXT: [[VEC_SEL2:%.*]] = select %active.lane.mask, [[VEC_SEL1]], [[VEC_PHI]] +; CHECK-TF-VF4IC1: middle.block: +; CHECK-TF-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL2]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-TF-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-TF-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 entry: br label %for.body @@ -78,6 +92,9 @@ ; CHECK-VF4IC4-LABEL: @select_i32_from_icmp ; CHECK-VF4IC4: vector.body: + +; CHECK-TF-VF4IC1-LABEL: @select_i32_from_icmp +; CHECK-TF-VF4IC1: vector.body: entry: br label %for.body @@ -110,6 +127,9 @@ ; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp ; CHECK-VF4IC4: vector.body: + +; CHECK-TF-VF4IC1-LABEL: @select_const_i32_from_fcmp +; CHECK-TF-VF4IC1: vector.body: entry: br label %for.body @@ -133,6 +153,8 @@ ; CHECK-VF4IC1-NOT: vector.body ; CHECK-VF4IC4-LABEL: @select_const_f32_from_icmp ; CHECK-VF4IC4-NOT: vector.body +; CHECK-TF-VF4IC1-LABEL: @select_const_f32_from_icmp +; CHECK-TF-VF4IC1-NOT: vector.body entry: br label %for.body @@ -168,6 +190,9 @@ ; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp ; CHECK-VF4IC4: vector.body: + +; CHECK-TF-VF4IC1-LABEL: @pred_select_const_i32_from_icmp +; CHECK-TF-VF4IC1: vector.body: entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll +++ 
llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll @@ -1,16 +1,21 @@ ; REQUIRES: asserts ; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \ -; RUN: -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4 +; RUN: -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK ; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \ -; RUN: -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8 +; RUN: -force-vector-interleave=1 -mcpu=neoverse-n2 -S 2>&1 | FileCheck %s --check-prefix=CHECK-CPU-NEOVERSE-N2 ; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \ -; RUN: -force-vector-width=4 -force-vector-interleave=1 -mcpu=neoverse-n2 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4-CPU-NEOVERSE-N2 +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-interleave=1 \ +; RUN: -S 2>&1 | FileCheck %s --check-prefix=CHECK-TF target triple="aarch64-unknown-linux-gnu" -; CHECK-VF4: Found an estimated cost of 16 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07 -; CHECK-VF8: Found an estimated cost of 32 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07 -; CHECK-VF4-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07 +; CHECK: Found an estimated cost of 8 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07 +; CHECK: Found an estimated cost of 16 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07 +; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07 +; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07 + +; CHECK-TF: Found an estimated cost of 8 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07 +; CHECK-TF: Found an estimated cost of 16 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07 define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 { entry: @@ -31,9 +36,9 @@ } -; CHECK-VF4: Found an estimated cost of 16 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07 -; CHECK-VF8: Found an estimated cost of 32 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07 -; CHECK-VF4-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07 +; CHECK: Found an estimated cost of 8 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07 +; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07 +; CHECK-TF: Found an estimated cost of 8 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07 define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 { entry: Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll @@ -11,12 +11,15 @@ ; The test checks if the mask is being correctly created, reverted and used ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | 
FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" define void @vector_reverse_mask_nxv4i1(double* %a, double* %cond, i64 %N) #0 { ; CHECK-LABEL: vector.body: +; CHECK: %{{.*}} = fcmp une ; CHECK: %[[REVERSE6:.*]] = call @llvm.experimental.vector.reverse.nxv4i1( %{{.*}}) ; CHECK: %[[WIDEMSKLOAD:.*]] = call @llvm.masked.load.nxv4f64.p0nxv4f64(* %{{.*}}, i32 8, %[[REVERSE6]], poison) ; CHECK-NEXT: %[[FADD:.*]] = fadd %[[WIDEMSKLOAD]] Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll @@ -6,39 +6,30 @@ ; a[i] = b[i] + 1.0; ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF -define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0{ +define void @vector_reverse_f64(i64 %N, double* noalias %a, double* noalias %b) #0{ ; CHECK-LABEL: @vector_reverse_f64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A2:%.*]] = ptrtoint double* [[A:%.*]] to i64 -; CHECK-NEXT: [[B1:%.*]] = ptrtoint double* [[B:%.*]] to i64 ; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 6 -; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 3 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[B1]] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP4]], [[A2]] -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP7]], [[TMP3]] -; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[INDEX]], -1 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[N]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP5]] ; CHECK-NEXT: 
[[TMP7:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP7]], -8 ; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[DOTNEG]], 1 @@ -46,11 +37,11 @@ ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[TMP10]] to * ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP13:%.*]] = fadd [[WIDE_LOAD]], shufflevector ( insertelement ( poison, double 1.000000e+00, i32 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[DOTNEG7:%.*]] = mul i32 [[TMP14]], -8 -; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[DOTNEG7]], 1 +; CHECK-NEXT: [[DOTNEG2:%.*]] = mul i32 [[TMP14]], -8 +; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[DOTNEG2]], 1 ; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, double* [[TMP12]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to * @@ -59,12 +50,12 @@ ; CHECK-NEXT: [[TMP20:%.*]] = shl i64 [[TMP19]], 3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -79,7 +70,79 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]] ; CHECK-NEXT: store double [[ADD]], double* [[ARRAYIDX1]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; +; CHECK-TF-LABEL: @vector_reverse_f64( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-TF-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-TF: for.body.preheader: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 3 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 3 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], -1 +; 
CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC_NEG:%.*]] = sub i64 [[N_MOD_VF]], [[N_RND_UP]] +; CHECK-TF-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC_NEG]], [[N]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = xor i64 [[INDEX]], -1 +; CHECK-TF-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], [[N]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP10]] +; CHECK-TF-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP12]], -8 +; CHECK-TF-NEXT: [[TMP13:%.*]] = or i32 [[DOTNEG]], 1 +; CHECK-TF-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[TMP11]], i64 [[TMP14]] +; CHECK-TF-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vector.reverse.nxv8i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f64.p0nxv8f64(* nonnull [[TMP16]], i32 8, [[REVERSE]], poison) +; CHECK-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP10]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, double 1.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[DOTNEG4:%.*]] = mul i32 [[TMP19]], -8 +; CHECK-TF-NEXT: [[TMP20:%.*]] = or i32 [[DOTNEG4]], 1 +; CHECK-TF-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64 +; CHECK-TF-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP17]], i64 [[TMP21]] +; CHECK-TF-NEXT: [[REVERSE3:%.*]] = call @llvm.experimental.vector.reverse.nxv8i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv8f64.p0nxv8f64( [[TMP18]], * [[TMP23]], i32 8, [[REVERSE3]]) +; CHECK-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP25:%.*]] = shl i64 [[TMP24]], 3 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP25]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP26:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP26]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.cond.cleanup.loopexit: +; CHECK-TF-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[I_08_IN:%.*]] = phi i64 [ [[I_08:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[I_08]] = add nsw i64 
[[I_08_IN]], -1 +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[I_08]] +; CHECK-TF-NEXT: [[TMP27:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-TF-NEXT: [[ADD:%.*]] = fadd double [[TMP27]], 1.000000e+00 +; CHECK-TF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]] +; CHECK-TF-NEXT: store double [[ADD]], double* [[ARRAYIDX1]], align 8 +; CHECK-TF-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1 +; CHECK-TF-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: %cmp7 = icmp sgt i64 %N, 0 @@ -130,30 +193,30 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[INDEX]], -1 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[N]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP7]], -8 -; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[DOTNEG]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP6]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to * -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[DOTNEG7:%.*]] = mul i32 [[TMP14]], -8 -; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[DOTNEG7]], 1 -; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64* [[TMP17]] to * -; CHECK-NEXT: store [[TMP13]], * [[TMP18]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP20:%.*]] = shl i64 [[TMP19]], 3 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[INDEX]], -1 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], [[N]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP13]], -8 +; CHECK-NEXT: [[TMP14:%.*]] = or i32 [[DOTNEG]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP16]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP17]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP19:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG4:%.*]] = mul i32 [[TMP20]], -8 +; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[DOTNEG4]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = sext i32 [[TMP21]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr 
inbounds i64, i64* [[TMP18]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP23]] to * +; CHECK-NEXT: store [[TMP19]], * [[TMP24]], align 8 +; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP26:%.*]] = shl i64 [[TMP25]], 3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -168,12 +231,95 @@ ; CHECK-NEXT: [[I_09_IN:%.*]] = phi i64 [ [[I_09:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[I_09]] = add nsw i64 [[I_09_IN]], -1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I_09]] -; CHECK-NEXT: [[TMP22:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = add i64 [[TMP22]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[TMP28]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_09]] ; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX2]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_09_IN]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] +; +; CHECK-TF-LABEL: @vector_reverse_i64( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[A2:%.*]] = ptrtoint i64* [[A:%.*]] to i64 +; CHECK-TF-NEXT: [[B1:%.*]] = ptrtoint i64* [[B:%.*]] to i64 +; CHECK-TF-NEXT: [[CMP8:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-TF-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-TF: for.body.preheader: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-TF: vector.memcheck: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 6 +; CHECK-TF-NEXT: [[TMP6:%.*]] = shl i64 [[N]], 3 +; CHECK-TF-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], [[B1]] +; CHECK-TF-NEXT: [[TMP8:%.*]] = add i64 [[TMP6]], [[A2]] +; CHECK-TF-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] +; CHECK-TF-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP9]], [[TMP5]] +; CHECK-TF-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 3 +; CHECK-TF-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 3 +; CHECK-TF-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP14]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP11]] +; CHECK-TF-NEXT: [[N_VEC_NEG:%.*]] = sub i64 [[N_MOD_VF]], [[N_RND_UP]] +; CHECK-TF-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC_NEG]], [[N]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) 
+; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP15:%.*]] = xor i64 [[INDEX]], -1 +; CHECK-TF-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], [[N]] +; CHECK-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP16]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP18]], -8 +; CHECK-TF-NEXT: [[TMP19:%.*]] = or i32 [[DOTNEG]], 1 +; CHECK-TF-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64 +; CHECK-TF-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, i64* [[TMP17]], i64 [[TMP20]] +; CHECK-TF-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vector.reverse.nxv8i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP22:%.*]] = bitcast i64* [[TMP21]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8i64.p0nxv8i64(* nonnull [[TMP22]], i32 8, [[REVERSE]], poison) +; CHECK-TF-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP16]] +; CHECK-TF-NEXT: [[TMP24:%.*]] = add [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[DOTNEG6:%.*]] = mul i32 [[TMP25]], -8 +; CHECK-TF-NEXT: [[TMP26:%.*]] = or i32 [[DOTNEG6]], 1 +; CHECK-TF-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64 +; CHECK-TF-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP23]], i64 [[TMP27]] +; CHECK-TF-NEXT: [[REVERSE5:%.*]] = call @llvm.experimental.vector.reverse.nxv8i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP29:%.*]] = bitcast i64* [[TMP28]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv8i64.p0nxv8i64( [[TMP24]], * [[TMP29]], i32 8, [[REVERSE5]]) +; CHECK-TF-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP31:%.*]] = shl i64 [[TMP30]], 3 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP31]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP32:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP32]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.cond.cleanup.loopexit: +; CHECK-TF-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[I_09_IN:%.*]] = phi i64 [ [[I_09:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[I_09]] = add nsw i64 [[I_09_IN]], -1 +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I_09]] +; CHECK-TF-NEXT: [[TMP33:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 +; CHECK-TF-NEXT: [[ADD:%.*]] = add i64 [[TMP33]], 1 +; CHECK-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_09]] +; CHECK-TF-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX2]], align 8 +; CHECK-TF-NEXT: 
[[CMP:%.*]] = icmp sgt i64 [[I_09_IN]], 1 +; CHECK-TF-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; entry: %cmp8 = icmp sgt i64 %N, 0 Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: asserts ; RUN: opt -loop-vectorize -S -mtriple=aarch64 -mattr=+sve -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -S -mtriple=aarch64 -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s 2>&1 | FileCheck %s --check-prefix=CHECK-TF target triple = "aarch64-unknown-linux-gnu" @@ -45,53 +47,53 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8*, i8** [[START_1:%.*]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[START_2:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, i8* [[START_2:%.*]], i64 [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP7]], 0 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP6]], 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP12:%.*]] = add [[DOTSPLAT]], [[TMP11]] -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP12]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], [[VECTOR_GEP]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, [[TMP13]], i64 1 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to * -; CHECK-NEXT: store [[TMP14]], * [[TMP16]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to * -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP19]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP18]] to * -; 
CHECK-NEXT: store [[TMP20]], * [[TMP21]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]] -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP11:%.*]] = add [[DOTSPLAT]], [[TMP10]] +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP11]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], [[VECTOR_GEP]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, [[TMP12]], i64 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8** [[TMP14]] to * +; CHECK-NEXT: store [[TMP13]], * [[TMP15]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, i8* [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP18]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP17]] to * +; CHECK-NEXT: store [[TMP19]], * [[TMP20]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8** [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START_1]], [[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8* [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[START_2]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[START_2]], [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP_BODY:%.*]] ; CHECK: loop.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_BODY]] ] ; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi i8** [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[PTR_IV_1_NEXT:%.*]], [[LOOP_BODY]] ] -; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi i8* [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[PTR_IV_2_NEXT:%.*]], [[LOOP_BODY]] ] +; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi i8* [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[PTR_IV_2_NEXT:%.*]], [[LOOP_BODY]] ] ; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr inbounds i8*, i8** [[PTR_IV_1]], i64 1 ; CHECK-NEXT: [[PTR_IV_2_NEXT]] = getelementptr inbounds i8, i8* [[PTR_IV_2]], i64 1 ; CHECK-NEXT: store i8* [[PTR_IV_2_NEXT]], i8** [[PTR_IV_1]], align 8 @@ -104,6 +106,85 @@ ; CHECK: exit: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @pointer_induction_used_as_vector( +; CHECK-TF-NEXT: entry: 
+; CHECK-TF-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[IND_END:%.*]] = getelementptr i8*, i8** [[START_1:%.*]], i64 [[N_VEC]] +; CHECK-TF-NEXT: [[IND_END2:%.*]] = getelementptr i8, i8* [[START_2:%.*]], i64 [[N_VEC]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP9]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 1 +; CHECK-TF-NEXT: [[TMP13:%.*]] = mul i64 1, [[TMP12]] +; CHECK-TF-NEXT: [[TMP14:%.*]] = mul i64 [[TMP11]], 0 +; CHECK-TF-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP14]], i32 0 +; CHECK-TF-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: [[TMP15:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-TF-NEXT: [[TMP16:%.*]] = add [[DOTSPLAT]], [[TMP15]] +; CHECK-TF-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP16]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP17:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], [[VECTOR_GEP]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, [[TMP17]], i64 1 +; CHECK-TF-NEXT: [[TMP19:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0 +; CHECK-TF-NEXT: [[TMP20:%.*]] = bitcast i8** [[TMP19]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2p0i8.p0nxv2p0i8( [[TMP18]], * [[TMP20]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement [[TMP17]], i32 0 +; CHECK-TF-NEXT: [[TMP22:%.*]] = getelementptr i8, i8* [[TMP21]], i32 0 +; CHECK-TF-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP22]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0nxv2i8(* [[TMP23]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP24:%.*]] = add [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP22]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2i8.p0nxv2i8( [[TMP24]], * [[TMP25]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP26:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP27]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP28:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP13]] +; CHECK-TF-NEXT: [[TMP29:%.*]] = extractelement [[TMP28]], i32 0 +; CHECK-TF-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8** [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START_1]], [[ENTRY]] ] +; CHECK-TF-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[START_2]], [[ENTRY]] ] +; CHECK-TF-NEXT: br label [[LOOP_BODY:%.*]] +; CHECK-TF: loop.body: +; CHECK-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_BODY]] ] +; CHECK-TF-NEXT: [[PTR_IV_1:%.*]] = phi i8** [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[PTR_IV_1_NEXT:%.*]], [[LOOP_BODY]] ] +; CHECK-TF-NEXT: [[PTR_IV_2:%.*]] = phi i8* [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[PTR_IV_2_NEXT:%.*]], [[LOOP_BODY]] ] +; CHECK-TF-NEXT: [[PTR_IV_1_NEXT]] = getelementptr inbounds i8*, i8** [[PTR_IV_1]], i64 1 +; CHECK-TF-NEXT: [[PTR_IV_2_NEXT]] = getelementptr inbounds i8, i8* [[PTR_IV_2]], i64 1 +; CHECK-TF-NEXT: store i8* [[PTR_IV_2_NEXT]], i8** [[PTR_IV_1]], align 8 +; CHECK-TF-NEXT: [[LV:%.*]] = load i8, i8* [[PTR_IV_2]], align 1 +; CHECK-TF-NEXT: [[ADD:%.*]] = add i8 [[LV]], 1 +; CHECK-TF-NEXT: store i8 [[ADD]], i8* [[PTR_IV_2]], align 1 +; CHECK-TF-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; CHECK-TF-NEXT: [[C:%.*]] = icmp ne i64 [[IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[C]], label [[LOOP_BODY]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-TF: exit: +; CHECK-TF-NEXT: ret void +; entry: @@ -144,19 +225,19 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDEX2_0:%.*]] = add i64 [[INDEX2]], 0 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[START]], i64 [[INDEX2_0]] -; CHECK-NEXT: [[NEXT_GEP_0:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[NEXT_GEP_0]] to * -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP16]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[NEXT_GEP_0]] to * -; CHECK-NEXT: store [[TMP17]], * [[TMP18]], align 1 -; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX2]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX2]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[START]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = 
getelementptr i8, i8* [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP6]] to * +; CHECK-NEXT: store [[TMP8]], * [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX2]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] @@ -168,8 +249,8 @@ ; CHECK-NEXT: [[PTR_PHI:%.*]] = phi i8* [ [[PTR_PHI_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INDEX_NXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = load i8, i8* [[PTR_PHI]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP24]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[PTR_PHI]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP13]], 1 ; CHECK-NEXT: store i8 [[ADD]], i8* [[PTR_PHI]], align 1 ; CHECK-NEXT: [[PTR_PHI_NEXT]] = getelementptr inbounds i8, i8* [[PTR_PHI]], i64 1 ; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i8* [[PTR_PHI_NEXT]], [[START]] @@ -178,6 +259,64 @@ ; CHECK: end: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @pointer_induction( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = sub i64 -1, [[TMP0]] +; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-TF-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] +; CHECK-TF-NEXT: br i1 [[TMP4]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-TF-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], 1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP9]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[START:%.*]], i64 [[N_VEC]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX2]], 0 +; CHECK-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[START]], i64 [[TMP10]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0 +; CHECK-TF-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0nxv2i8(* [[TMP12]], 
i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP13:%.*]] = add [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP11]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2i8.p0nxv2i8( [[TMP13]], * [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX2]], [[TMP16]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP0]]) +; CHECK-TF-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 +; CHECK-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[PTR_PHI:%.*]] = phi i8* [ [[PTR_PHI_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[INDEX_NXT]] = add i64 [[INDEX]], 1 +; CHECK-TF-NEXT: [[TMP19:%.*]] = load i8, i8* [[PTR_PHI]], align 1 +; CHECK-TF-NEXT: [[ADD:%.*]] = add i8 [[TMP19]], 1 +; CHECK-TF-NEXT: store i8 [[ADD]], i8* [[PTR_PHI]], align 1 +; CHECK-TF-NEXT: [[PTR_PHI_NEXT]] = getelementptr inbounds i8, i8* [[PTR_PHI]], i64 1 +; CHECK-TF-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i8* [[PTR_PHI_NEXT]], [[START]] +; CHECK-TF-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDEX]], [[N]] +; CHECK-TF-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-TF: end: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine -S < %s | FileCheck %s +; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine -S \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF ; Ensure that we can vectorize loops such as: ; int *ptr = c; @@ -103,6 +105,110 @@ ; CHECK: for.exit: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @widen_ptr_phi_unrolled( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: 
[[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 3 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[TMP9:%.*]] = shl i64 [[N_VEC]], 1 +; CHECK-TF-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 [[TMP9]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP11]], i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[C]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY2]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 +; CHECK-TF-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP12]], 4 +; CHECK-TF-NEXT: [[TMP15:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-TF-NEXT: [[VECTOR_GEP:%.*]] = shl [[TMP15]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], [[VECTOR_GEP]] +; CHECK-TF-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 +; CHECK-TF-NEXT: [[DOTSPLAT5:%.*]] = shufflevector [[DOTSPLATINSERT4]], poison, zeroinitializer +; CHECK-TF-NEXT: [[TMP17:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-TF-NEXT: [[TMP18:%.*]] = add [[DOTSPLAT5]], [[TMP17]] +; CHECK-TF-NEXT: [[VECTOR_GEP6:%.*]] = shl [[TMP18]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], [[VECTOR_GEP6]] +; CHECK-TF-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, [[TMP16]], i64 1 +; CHECK-TF-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, [[TMP19]], i64 1 +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP16]], i32 4, [[ACTIVE_LANE_MASK]], undef) +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP19]], i32 4, [[ACTIVE_LANE_MASK3]], undef) +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP20]], i32 4, [[ACTIVE_LANE_MASK]], undef) +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP21]], i32 4, [[ACTIVE_LANE_MASK3]], undef) +; CHECK-TF-NEXT: [[TMP22:%.*]] = add nsw [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP23:%.*]] = add nsw [[WIDE_MASKED_GATHER7]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, 
zeroinitializer) +; CHECK-TF-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP22]], * [[TMP25]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[TMP27:%.*]] = shl nuw nsw i32 [[TMP26]], 2 +; CHECK-TF-NEXT: [[TMP28:%.*]] = zext i32 [[TMP27]] to i64 +; CHECK-TF-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 [[TMP28]] +; CHECK-TF-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP23]], * [[TMP30]], i32 4, [[ACTIVE_LANE_MASK3]]) +; CHECK-TF-NEXT: [[TMP31:%.*]] = add nsw [[WIDE_MASKED_GATHER8]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP32:%.*]] = add nsw [[WIDE_MASKED_GATHER9]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP31]], * [[TMP34]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[TMP36:%.*]] = shl nuw nsw i32 [[TMP35]], 2 +; CHECK-TF-NEXT: [[TMP37:%.*]] = zext i32 [[TMP36]] to i64 +; CHECK-TF-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP33]], i64 [[TMP37]] +; CHECK-TF-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP32]], * [[TMP39]], i32 4, [[ACTIVE_LANE_MASK3]]) +; CHECK-TF-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP41:%.*]] = shl nuw nsw i64 [[TMP40]], 3 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP41]] +; CHECK-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP43:%.*]] = shl nuw nsw i64 [[TMP42]], 2 +; CHECK-TF-NEXT: [[TMP44:%.*]] = add i64 [[INDEX_NEXT]], [[TMP43]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP44]], i64 [[N]]) +; CHECK-TF-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP14]] +; CHECK-TF-NEXT: [[TMP45:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP45]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[C]], [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[PTR_014:%.*]] = phi i32* [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[PTR_014]], i64 1 +; CHECK-TF-NEXT: [[TMP46:%.*]] = load i32, i32* [[PTR_014]], align 4 +; CHECK-TF-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, i32* 
[[PTR_014]], i64 2 +; CHECK-TF-NEXT: [[TMP47:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-TF-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP46]], 1 +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_013]] +; CHECK-TF-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP47]], 1 +; CHECK-TF-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_013]] +; CHECK-TF-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX3]], align 4 +; CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-TF: for.exit: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -152,7 +258,7 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -187,12 +293,12 @@ ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32* [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[S_010:%.*]] = phi i32* [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[D_09:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[D_09:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[S_010]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP21]], 1 ; CHECK-NEXT: store i32 [[MUL]], i32* [[D_09]], align 4 @@ -204,6 +310,85 @@ ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @widen_2ptrs_phi_unrolled( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 3 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: 
[[IND_END:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[N_VEC]] +; CHECK-TF-NEXT: [[IND_END2:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[N_VEC]] +; CHECK-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP10]], i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = bitcast i32* [[NEXT_GEP]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[TMP13:%.*]] = shl nuw nsw i32 [[TMP12]], 2 +; CHECK-TF-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i64 [[TMP14]] +; CHECK-TF-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP16]], i32 4, [[ACTIVE_LANE_MASK5]], poison) +; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP18:%.*]] = shl nsw [[WIDE_MASKED_LOAD9]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP19:%.*]] = bitcast i32* [[NEXT_GEP7]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP17]], * [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[TMP21:%.*]] = shl nuw nsw i32 [[TMP20]], 2 +; CHECK-TF-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +; CHECK-TF-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[NEXT_GEP7]], i64 [[TMP22]] +; CHECK-TF-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP18]], * [[TMP24]], i32 4, [[ACTIVE_LANE_MASK5]]) +; CHECK-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[TMP25]], 3 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP26]] +; CHECK-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[TMP27]], 2 +; CHECK-TF-NEXT: [[TMP29:%.*]] = add i64 [[INDEX_NEXT]], [[TMP28]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP29]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP30:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP30]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop 
[[LOOP5:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ] +; CHECK-TF-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[S_010:%.*]] = phi i32* [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[D_09:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[TMP31:%.*]] = load i32, i32* [[S_010]], align 4 +; CHECK-TF-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP31]], 1 +; CHECK-TF-NEXT: store i32 [[MUL]], i32* [[D_09]], align 4 +; CHECK-TF-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[D_09]], i64 1 +; CHECK-TF-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, i32* [[S_010]], i64 1 +; CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -246,7 +431,7 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i32*, i32** [[B:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i32*, i32** [[B:%.*]], i64 [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] @@ -276,13 +461,13 @@ ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32** [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32** [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[P:%.*]] = phi i32* [ [[VAR3:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[Q:%.*]] = phi i32** [ [[VAR4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[Q:%.*]] = phi i32** [ [[VAR4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[VAR0:%.*]] = phi i32 [ [[VAR2:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[VAR1:%.*]] = load i32, i32* [[P]], align 8 ; CHECK-NEXT: [[VAR2]] = add i32 [[VAR1]], [[VAR0]] @@ -296,6 +481,47 @@ ; CHECK-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR2]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[VAR5]] ; +; 
CHECK-TF-LABEL: @pointer_iv_mixed( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[SMAX]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A:%.*]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +; CHECK-TF-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-TF-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], [[TMP2]] +; CHECK-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32*, i32** [[B:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[BC:%.*]] = bitcast [[TMP3]] to *> +; CHECK-TF-NEXT: [[TMP4:%.*]] = extractelement *> [[BC]], i64 0 +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i32.p0nxv2i32(* [[TMP4]], i32 8, [[ACTIVE_LANE_MASK]], zeroinitializer) +; CHECK-TF-NEXT: [[TMP5:%.*]] = bitcast i32** [[NEXT_GEP]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2p0i32.p0nxv2p0i32( [[TMP3]], * [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP6]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] +; CHECK-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 1 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[SMAX]]) +; CHECK-TF-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP1]] +; CHECK-TF-NEXT: [[TMP9:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[TMP6]]) +; CHECK-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-TF: for.end: +; CHECK-TF-NEXT: [[VAR5:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-TF-NEXT: ret i32 [[VAR5]] +; entry: br label %for.body @@ -369,6 +595,56 @@ ; CHECK: for.end: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @phi_used_in_vector_compare_and_scalar_indvar_update_and_store( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-TF-NEXT: [[IND_END:%.*]] = getelementptr i16, i16* [[PTR:%.*]], i64 
[[N_VEC]] +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[POINTER_PHI:%.*]] = phi i16* [ [[PTR]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1 +; CHECK-TF-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = getelementptr i16, i16* [[POINTER_PHI]], [[TMP4]] +; CHECK-TF-NEXT: [[TMP6:%.*]] = icmp ne [[TMP5]], zeroinitializer +; CHECK-TF-NEXT: [[BC:%.*]] = bitcast [[TMP5]] to *> +; CHECK-TF-NEXT: [[TMP7:%.*]] = extractelement *> [[BC]], i64 0 +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2i16.p0nxv2i16( zeroinitializer, * [[TMP7]], i32 2, [[TMP6]]) +; CHECK-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 1 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-TF-NEXT: [[PTR_IND]] = getelementptr i16, i16* [[POINTER_PHI]], i64 [[TMP3]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-TF-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-TF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[ENTRY]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[IV_PTR:%.*]] = phi i16* [ [[INCDEC_IV_PTR:%.*]], [[IF_END]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i16* [[IV_PTR]], null +; CHECK-TF-NEXT: br i1 [[CMP_I_NOT]], label [[IF_END]], label [[IF_END_SINK_SPLIT:%.*]] +; CHECK-TF: if.end.sink.split: +; CHECK-TF-NEXT: store i16 0, i16* [[IV_PTR]], align 2 +; CHECK-TF-NEXT: br label [[IF_END]] +; CHECK-TF: if.end: +; CHECK-TF-NEXT: [[INCDEC_IV_PTR]] = getelementptr inbounds i16, i16* [[IV_PTR]], i64 1 +; CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp ult i64 [[IV]], 1023 +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-TF: for.end: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll @@ -12,25 +12,20 @@ ; The test checks if the mask is being correctly created, reverted and used ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" -define void 
@vector_reverse_mask_v4i1(double* %a, double* %cond, i64 %N) #0 { +define void @vector_reverse_mask_v4i1(double* noalias %a, double* noalias %cond, i64 %N) #0 { ; CHECK-LABEL: @vector_reverse_mask_v4i1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body.preheader: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[N]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[COND:%.*]], i64 [[N]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP4]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[COND]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -8 ; CHECK-NEXT: [[IND_END:%.*]] = and i64 [[N]], 7 @@ -39,42 +34,42 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[INDEX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[N]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[COND]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[COND:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 -3 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 8 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 -4 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP5]], i64 -3 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <4 x double>* -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8, !alias.scope !0 -; CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x double> [[WIDE_LOAD6]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x double> [[WIDE_LOAD1]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = fcmp une <4 x double> [[REVERSE7]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = fcmp une <4 x double> [[REVERSE2]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, double* [[TMP10]], i64 -3 -; CHECK-NEXT: [[REVERSE8:%.*]] = shufflevector <4 x i1> [[TMP8]], <4 x i1> poison, <4 x i32> +; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i1> [[TMP8]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP11]] to <4 x double>* -; CHECK-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP12]], i32 8, <4 x i1> [[REVERSE8]], <4 x double> poison), !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP12]], i32 8, <4 x i1> [[REVERSE3]], <4 x double> poison) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP10]], i64 -4 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, double* [[TMP13]], i64 -3 -; CHECK-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i1> [[TMP9]], <4 x i1> poison, <4 x i32> +; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i1> [[TMP9]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[REVERSE10]], <4 x double> poison), !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[REVERSE5]], <4 x double> poison) ; CHECK-NEXT: [[TMP16:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], -; CHECK-NEXT: [[TMP17:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], +; CHECK-NEXT: [[TMP17:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD6]], ; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP11]] to <4 x double>* -; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP16]], <4 x double>* [[TMP18]], i32 8, <4 x i1> [[REVERSE8]]), !alias.scope !3, !noalias !0 +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP16]], <4 x double>* [[TMP18]], i32 8, <4 x i1> [[REVERSE3]]) ; CHECK-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP14]] to <4 x double>* -; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP17]], <4 x double>* [[TMP19]], i32 8, <4 x i1> [[REVERSE10]]), !alias.scope !3, !noalias !0 +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP17]], <4 x double>* [[TMP19]], i32 8, <4 x i1> [[REVERSE5]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -95,7 +90,76 @@ ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; +; CHECK-TF-LABEL: @vector_reverse_mask_v4i1( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-TF-NEXT: br i1 
[[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-TF: for.body.preheader: +; CHECK-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 4, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[INDEX]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[N]] +; CHECK-TF-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[COND:%.*]], i64 [[TMP1]] +; CHECK-TF-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 -3 +; CHECK-TF-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> poison, <4 x i32> +; CHECK-TF-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>* +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP4]], i32 8, <4 x i1> [[REVERSE]], <4 x double> poison) +; CHECK-TF-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> +; CHECK-TF-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 -4 +; CHECK-TF-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP5]], i64 -3 +; CHECK-TF-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i1> poison, <4 x i32> +; CHECK-TF-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <4 x double>* +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP7]], i32 8, <4 x i1> [[REVERSE4]], <4 x double> poison) +; CHECK-TF-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD5]], <4 x double> poison, <4 x i32> +; CHECK-TF-NEXT: [[TMP8:%.*]] = fcmp une <4 x double> [[REVERSE3]], zeroinitializer +; CHECK-TF-NEXT: [[TMP9:%.*]] = fcmp une <4 x double> [[REVERSE6]], zeroinitializer +; CHECK-TF-NEXT: [[TMP10:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[TMP1]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer +; CHECK-TF-NEXT: [[TMP12:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP10]], i64 -3 +; CHECK-TF-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <4 x i32> +; CHECK-TF-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>* +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP14]], i32 8, <4 x i1> [[REVERSE7]], <4 x double> poison) +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[TMP10]], i64 -4 +; CHECK-TF-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP15]], i64 -3 +; CHECK-TF-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> poison, 
<4 x i32> +; CHECK-TF-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>* +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[REVERSE10]], <4 x double> poison) +; CHECK-TF-NEXT: [[TMP18:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD8]], +; CHECK-TF-NEXT: [[TMP19:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], +; CHECK-TF-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP13]] to <4 x double>* +; CHECK-TF-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP18]], <4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE7]]) +; CHECK-TF-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP16]] to <4 x double>* +; CHECK-TF-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP19]], <4 x double>* [[TMP21]], i32 8, <4 x i1> [[REVERSE10]]) +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-TF-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 12 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP22]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.cond.cleanup.loopexit: +; CHECK-TF-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; CHECK-TF: for.body: +; CHECK-TF-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; CHECK-TF: if.then: +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; CHECK-TF-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: Index: llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll @@ -6,14 +6,16 @@ ; a[i] = b[i] + 1.0; ; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S < %s | FileCheck %s +; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0 { ; CHECK-LABEL: vector_reverse_f64 ; CHECK-LABEL: vector.body ; CHECK: %[[GEP:.*]] = getelementptr inbounds double, double* %{{.*}}, i32 0 ; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i32 -7 -; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <8 x double>* -; CHECK-NEXT: %[[WIDE:.*]] = load <8 x double>, <8 x double>* %[[CAST]], align 8 +; CHECK: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <8 x double>* +; CHECK-NEXT: %[[WIDE:.*]] = {{.*}}load{{.*}}<8 x double>* %[[CAST]] ; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x double> %[[WIDE]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: %[[FADD:.*]] = fadd <8 x double> %[[REVERSE]] ; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* {{.*}}, i64 {{.*}} @@ -21,7 +23,7 @@ ; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i32 0 ; CHECK-NEXT: 
%[[GEP4:.*]] = getelementptr inbounds double, double* %[[GEP3]], i32 -7
; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP4]] to <8 x double>*
-; CHECK-NEXT: store <8 x double> %[[REVERSE6]], <8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: store{{.*}}<8 x double> %[[REVERSE6]], <8 x double>* %[[CAST]]
entry:
%cmp7 = icmp sgt i64 %N, 0
@@ -47,8 +49,8 @@
; CHECK-LABEL: vector.body
; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, i64* %{{.*}}, i32 0
; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i32 -7
-; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <8 x i64>*
-; CHECK-NEXT: %[[WIDE:.*]] = load <8 x i64>, <8 x i64>* %[[CAST]], align 8
+; CHECK: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = {{.*}}load{{.*}}<8 x i64>* %[[CAST]]
; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x i64> %[[WIDE]], <8 x i64> poison, <8 x i32>
; CHECK-NEXT: %[[FADD:.*]] = add <8 x i64> %[[REVERSE]]
; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* {{.*}}, i64 {{.*}}
@@ -56,7 +58,7 @@
; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i32 0
; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds i64, i64* %[[GEP3]], i32 -7
; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP4]] to <8 x i64>*
-; CHECK-NEXT: store <8 x i64> %[[REVERSE6]], <8 x i64>* %[[CAST1]], align 8
+; CHECK-NEXT: store{{.*}}<8 x i64> %[[REVERSE6]], <8 x i64>* %[[CAST1]]
entry:
%cmp8 = icmp sgt i64 %N, 0