Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll @@ -1,6 +1,8 @@ ; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize \ ; RUN: -pass-remarks-missed=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve,+bf16 -S 2>%t | FileCheck %s -check-prefix=CHECK ; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARK +; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize \ +; RUN: -pass-remarks-missed=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve,+bf16 -S | FileCheck %s --check-prefix=CHECK-TF ; Reduction can be vectorized @@ -17,6 +19,22 @@ ; CHECK: middle.block: ; CHECK: %[[ADD:.*]] = add %[[ADD2]], %[[ADD1]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32( %[[ADD]]) + +; CHECK-TF-LABEL: @add +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[ADD1:.*]] = add %[[LOAD1]] +; CHECK-TF: %[[ADD2:.*]] = add %[[LOAD2]] +; CHECK-TF: %[[SEL_ADD1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[ADD1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_ADD2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[ADD2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[ADD:.*]] = add %[[SEL_ADD2]], %[[SEL_ADD1]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32( %[[ADD]]) entry: br label %for.body @@ -47,6 +65,22 @@ ; CHECK: middle.block: ; CHECK: %[[OR:.*]] = or %[[OR2]], %[[OR1]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32( %[[OR]]) + +; CHECK-TF-LABEL: @or +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[OR1:.*]] = or %[[LOAD1]] +; CHECK-TF: %[[OR2:.*]] = or %[[LOAD2]] +; CHECK-TF: %[[SEL_OR1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[OR1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_OR2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[OR2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[OR:.*]] = or %[[SEL_OR2]], %[[SEL_OR1]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32( %[[OR]]) entry: br label %for.body @@ -77,6 +111,22 @@ ; CHECK: middle.block: ; CHECK: %[[ABD:.*]] = and %[[ADD2]], %[[AND1]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32( %[[ADD]]) + +; CHECK-TF-LABEL: @and +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[AND1:.*]] = and %[[LOAD1]] +; CHECK-TF: %[[AND2:.*]] = and %[[LOAD2]] +; CHECK-TF: %[[SEL_AND1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[AND1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_AND2:.*]] = select %[[ACTIVE_LANE_MASK2]], 
%[[AND2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[AND:.*]] = and %[[SEL_AND2]], %[[SEL_AND1]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32( %[[AND]]) entry: br label %for.body @@ -107,6 +157,22 @@ ; CHECK: middle.block: ; CHECK: %[[XOR:.*]] = xor %[[XOR2]], %[[XOR1]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32( %[[XOR]]) + +; CHECK-TF-LABEL: @xor +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[XOR1:.*]] = xor %[[LOAD1]] +; CHECK-TF: %[[XOR2:.*]] = xor %[[LOAD2]] +; CHECK-TF: %[[SEL_XOR1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[XOR1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_XOR2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[XOR2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[XOR:.*]] = xor %[[SEL_XOR2]], %[[SEL_XOR1]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32( %[[XOR]]) entry: br label %for.body @@ -140,6 +206,25 @@ ; CHECK: %[[ICMP:.*]] = icmp slt %[[SEL1]], %[[SEL2]] ; CHECK-NEXT: %[[SEL:.*]] = select %[[ICMP]], %[[SEL1]], %[[SEL2]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32( %[[SEL]]) + +; CHECK-TF-LABEL: @smin +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[ICMP1:.*]] = icmp slt %[[LOAD1]] +; CHECK-TF: %[[ICMP2:.*]] = icmp slt %[[LOAD2]] +; CHECK-TF: %[[SEL1:.*]] = select %[[ICMP1]], %[[LOAD1]] +; CHECK-TF: %[[SEL2:.*]] = select %[[ICMP2]], %[[LOAD2]] +; CHECK-TF: %[[SEL_SEL1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[SEL1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_SEL2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[SEL2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[ICMP:.*]] = icmp slt %[[SEL_SEL1]], %[[SEL_SEL2]] +; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ICMP]], %[[SEL_SEL1]], %[[SEL_SEL2]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32( %[[SEL]]) entry: br label %for.body @@ -174,6 +259,25 @@ ; CHECK: %[[ICMP:.*]] = icmp ugt %[[SEL1]], %[[SEL2]] ; CHECK-NEXT: %[[SEL:.*]] = select %[[ICMP]], %[[SEL1]], %[[SEL2]] ; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32( %[[SEL]]) + +; CHECK-TF-LABEL: @umax +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi +; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi +; CHECK-TF: %[[VEC_PHI1:.*]] = phi +; CHECK-TF: %[[VEC_PHI2:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8i32 +; CHECK-TF: %[[ICMP1:.*]] = icmp ugt %[[LOAD1]] +; CHECK-TF: %[[ICMP2:.*]] = icmp ugt %[[LOAD2]] +; CHECK-TF: %[[SEL1:.*]] = select %[[ICMP1]], %[[LOAD1]] +; CHECK-TF: %[[SEL2:.*]] = select %[[ICMP2]], %[[LOAD2]] +; CHECK-TF: %[[SEL_SEL1:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[SEL1]], %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_SEL2:.*]] = select %[[ACTIVE_LANE_MASK2]], %[[SEL2]], %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[ICMP:.*]] = icmp ugt %[[SEL_SEL1]], %[[SEL_SEL2]] +; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ICMP]], %[[SEL_SEL1]], %[[SEL_SEL2]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32( %[[SEL]]) entry: br label %for.body @@ 
-205,6 +309,22 @@
; CHECK: middle.block:
; CHECK: %[[ADD:.*]] = fadd fast %[[ADD2]], %[[ADD1]]
; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, %[[ADD]])
+
+; CHECK-TF-LABEL: @fadd_fast
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi
+; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi
+; CHECK-TF: %[[VEC_PHI1:.*]] = phi
+; CHECK-TF: %[[VEC_PHI2:.*]] = phi
+; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[ADD1:.*]] = fadd fast %[[LOAD1]]
+; CHECK-TF: %[[ADD2:.*]] = fadd fast %[[LOAD2]]
+; CHECK-TF: %[[SEL_ADD1:.*]] = select fast %[[ACTIVE_LANE_MASK1]], %[[ADD1]], %[[VEC_PHI1]]
+; CHECK-TF: %[[SEL_ADD2:.*]] = select fast %[[ACTIVE_LANE_MASK2]], %[[ADD2]], %[[VEC_PHI2]]
+; CHECK-TF: middle.block:
+; CHECK-TF: %[[ADD:.*]] = fadd fast %[[SEL_ADD2]], %[[SEL_ADD1]]
+; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, %[[ADD]])
entry:
br label %for.body
@@ -234,6 +354,22 @@
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]]
; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]])
+
+; CHECK-TF-LABEL: @fadd_fast_bfloat
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi <8 x i1>
+; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi <8 x i1>
+; CHECK-TF: %[[VEC_PHI1:.*]] = phi <8 x bfloat>
+; CHECK-TF: %[[VEC_PHI2:.*]] = phi <8 x bfloat>
+; CHECK-TF: %[[LOAD1:.*]] = call <8 x bfloat> @llvm.masked.load.v8bf16
+; CHECK-TF: %[[LOAD2:.*]] = call <8 x bfloat> @llvm.masked.load.v8bf16
+; CHECK-TF: %[[ADD1:.*]] = fadd fast <8 x bfloat> %[[LOAD1]]
+; CHECK-TF: %[[ADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]]
+; CHECK-TF: %[[SEL_ADD1:.*]] = select fast <8 x i1> %[[ACTIVE_LANE_MASK1]], <8 x bfloat> %[[ADD1]], <8 x bfloat> %[[VEC_PHI1]]
+; CHECK-TF: %[[SEL_ADD2:.*]] = select fast <8 x i1> %[[ACTIVE_LANE_MASK2]], <8 x bfloat> %[[ADD2]], <8 x bfloat> %[[VEC_PHI2]]
+; CHECK-TF: middle.block:
+; CHECK-TF: %[[ADD:.*]] = fadd fast <8 x bfloat> %[[SEL_ADD2]], %[[SEL_ADD1]]
+; CHECK-TF-NEXT: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[ADD]])
entry:
br label %for.body
@@ -267,6 +403,25 @@
; CHECK: %[[FCMP:.*]] = fcmp fast olt %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select fast %[[FCMP]], %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: call fast float @llvm.vector.reduce.fmin.nxv8f32( %[[SEL]])
+
+; CHECK-TF-LABEL: @fmin_fast
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi
+; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi
+; CHECK-TF: %[[VEC_PHI1:.*]] = phi
+; CHECK-TF: %[[VEC_PHI2:.*]] = phi
+; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[FCMP1:.*]] = fcmp fast olt %[[LOAD1]]
+; CHECK-TF: %[[FCMP2:.*]] = fcmp fast olt %[[LOAD2]]
+; CHECK-TF: %[[SEL1:.*]] = select %[[FCMP1]], %[[LOAD1]]
+; CHECK-TF: %[[SEL2:.*]] = select %[[FCMP2]], %[[LOAD2]]
+; CHECK-TF: %[[SEL_SEL1:.*]] = select fast %[[ACTIVE_LANE_MASK1]], %[[SEL1]], %[[VEC_PHI1]]
+; CHECK-TF: %[[SEL_SEL2:.*]] = select fast %[[ACTIVE_LANE_MASK2]], %[[SEL2]], %[[VEC_PHI2]]
+; CHECK-TF: middle.block:
+; CHECK-TF: %[[FCMP:.*]] = fcmp fast olt %[[SEL_SEL1]], %[[SEL_SEL2]]
+; CHECK-TF-NEXT: %[[SEL:.*]] = select fast %[[FCMP]], %[[SEL_SEL1]], %[[SEL_SEL2]]
+; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fmin.nxv8f32( %[[SEL]])
entry:
br label %for.body
@@ -301,6 +456,25 @@
; CHECK: %[[FCMP:.*]] = fcmp fast ogt %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select fast %[[FCMP]], %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: call fast float @llvm.vector.reduce.fmax.nxv8f32( %[[SEL]])
+
+; CHECK-TF-LABEL: @fmax_fast
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi
+; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi
+; CHECK-TF: %[[VEC_PHI1:.*]] = phi
+; CHECK-TF: %[[VEC_PHI2:.*]] = phi
+; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[LOAD2:.*]] = call @llvm.masked.load.nxv8f32
+; CHECK-TF: %[[FCMP1:.*]] = fcmp fast ogt %[[LOAD1]]
+; CHECK-TF: %[[FCMP2:.*]] = fcmp fast ogt %[[LOAD2]]
+; CHECK-TF: %[[SEL1:.*]] = select %[[FCMP1]], %[[LOAD1]]
+; CHECK-TF: %[[SEL2:.*]] = select %[[FCMP2]], %[[LOAD2]]
+; CHECK-TF: %[[SEL_SEL1:.*]] = select fast %[[ACTIVE_LANE_MASK1]], %[[SEL1]], %[[VEC_PHI1]]
+; CHECK-TF: %[[SEL_SEL2:.*]] = select fast %[[ACTIVE_LANE_MASK2]], %[[SEL2]], %[[VEC_PHI2]]
+; CHECK-TF: middle.block:
+; CHECK-TF: %[[FCMP:.*]] = fcmp fast ogt %[[SEL_SEL1]], %[[SEL_SEL2]]
+; CHECK-TF-NEXT: %[[SEL:.*]] = select fast %[[FCMP]], %[[SEL_SEL1]], %[[SEL_SEL2]]
+; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fmax.nxv8f32( %[[SEL]])
entry:
br label %for.body
@@ -333,6 +507,17 @@
; CHECK: %[[ADD:.*]] = add %[[ADD2]], %[[ADD1]]
; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( %[[ADD]])
; CHECK-NEXT: store i32 %[[SUM]], i32* %gep.dst, align 4
+
+; CHECK-TF-LABEL: @invariant_store
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi
+; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv4i32
+; CHECK-TF: %[[ADD1:.*]] = add %{{.*}}, %[[LOAD1]]
+; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK1]], %[[ADD1]],
+; CHECK-TF: middle.block:
+; CHECK-TF-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( %[[SEL]])
+; CHECK-TF-NEXT: store i32 %[[SUM]], i32* %gep.dst, align 4
+
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
store i32 0, i32* %gep.dst, align 4
@@ -368,6 +553,22 @@
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+
+; CHECK-TF-LABEL: @mul
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi <2 x i1>
+; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi <2 x i1>
+; CHECK-TF: %[[VEC_PHI1:.*]] = phi <2 x i32>
+; CHECK-TF: %[[VEC_PHI2:.*]] = phi <2 x i32>
+; CHECK-TF: %[[LOAD1:.*]] = call <2 x i32> @llvm.masked.load.v2i32
+; CHECK-TF: %[[LOAD2:.*]] = call <2 x i32> @llvm.masked.load.v2i32
+; CHECK-TF: %[[MUL1:.*]] = mul <2 x i32> %[[LOAD1]]
+; CHECK-TF: %[[MUL2:.*]] = mul <2 x i32> %[[LOAD2]]
+; CHECK-TF: %[[SEL_MUL1:.*]] = select <2 x i1> %[[ACTIVE_LANE_MASK1]], <2 x i32> %[[MUL1]], <2 x i32> %[[VEC_PHI1]]
+; CHECK-TF: %[[SEL_MUL2:.*]] = select <2 x i1> %[[ACTIVE_LANE_MASK2]], <2 x i32> %[[MUL2]], <2 x i32> %[[VEC_PHI2]]
+; CHECK-TF: middle.block:
+; CHECK-TF: %[[RDX:.*]] = mul <2 x i32> %[[SEL_MUL2]], %[[SEL_MUL1]]
+; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %[[RDX]])
entry:
br label %for.body
@@ -402,6 +603,26 @@
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+
+; CHECK-TF-LABEL: @memory_dependence
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK1:.*]] = phi <2 x i1>
+; CHECK-TF: %[[ACTIVE_LANE_MASK2:.*]] = phi <2 x i1>
+; CHECK-TF: %[[VEC_PHI1:.*]] = phi <2 x i32>
+; CHECK-TF: %[[VEC_PHI2:.*]] = phi <2 x i32>
+; CHECK-TF: %[[LOAD1:.*]] = call <2 x i32> @llvm.masked.load.v2i32 +; CHECK-TF: %[[LOAD2:.*]] = call <2 x i32> @llvm.masked.load.v2i32 +; CHECK-TF: %[[LOAD3:.*]] = call <2 x i32> @llvm.masked.load.v2i32 +; CHECK-TF: %[[LOAD4:.*]] = call <2 x i32> @llvm.masked.load.v2i32 +; CHECK-TF: %[[ADD1:.*]] = add nsw <2 x i32> %[[LOAD3]], %[[LOAD1]] +; CHECK-TF: %[[ADD2:.*]] = add nsw <2 x i32> %[[LOAD4]], %[[LOAD2]] +; CHECK-TF: %[[MUL1:.*]] = mul <2 x i32> %[[LOAD3]] +; CHECK-TF: %[[MUL2:.*]] = mul <2 x i32> %[[LOAD4]] +; CHECK-TF: %[[SEL_MUL1:.*]] = select <2 x i1> %[[ACTIVE_LANE_MASK1]], <2 x i32> %[[MUL1]], <2 x i32> %[[VEC_PHI1]] +; CHECK-TF: %[[SEL_MUL2:.*]] = select <2 x i1> %[[ACTIVE_LANE_MASK2]], <2 x i32> %[[MUL2]], <2 x i32> %[[VEC_PHI2]] +; CHECK-TF: middle.block: +; CHECK-TF: %[[RDX:.*]] = mul <2 x i32> %[[SEL_MUL2]], %[[SEL_MUL1]] +; CHECK-TF-NEXT: call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %[[RDX]]) entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-basic-vec.ll @@ -1,5 +1,7 @@ ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s -S | FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s -S | FileCheck %s --check-prefix=CHECK-TF target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -14,6 +16,14 @@ ; CHECK-NEXT: [[TMP2:%.*]] = select [[TMP1]], shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i32 10, i32 0), poison, zeroinitializer) ; CHECK: store [[TMP2]], * {{.*}}, align 4 ; +; CHECK-TF-LABEL: @cmpsel_i32( +; CHECK-TF-NEXT: entry: +; CHECK-TF: vector.body: +; CHECK-TF: [[WIDE_LOAD:%.*]] = call @llvm.masked.load.nxv4i32 +; CHECK-TF-NEXT: [[TMP1:%.*]] = icmp eq [[WIDE_LOAD]], zeroinitializer +; CHECK-TF-NEXT: [[TMP2:%.*]] = select [[TMP1]], shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i32 10, i32 0), poison, zeroinitializer) +; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP2]] +; entry: br label %for.body @@ -45,6 +55,14 @@ ; CHECK-NEXT: [[TMP2:%.*]] = select [[TMP1]], shufflevector ( insertelement ( poison, float 1.000000e+01, i32 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, float 2.000000e+00, i32 0), poison, zeroinitializer) ; CHECK: store [[TMP2]], * {{.*}}, align 4 +; CHECK-TF-LABEL: @cmpsel_f32( +; CHECK-TF-NEXT: entry: +; CHECK-TF: vector.body: +; CHECK-TF: [[WIDE_LOAD:%.*]] = call @llvm.masked.load.nxv4f32 +; CHECK-TF-NEXT: [[TMP1:%.*]] = fcmp ogt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP2:%.*]] = select [[TMP1]], shufflevector ( insertelement ( poison, float 1.000000e+01, i32 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, float 2.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-TF: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[TMP2]] +; entry: br label %for.body @@ -72,6 +90,12 @@ ; CHECK-NEXT: [[TMP1:%.*]] = fneg [[WIDE_LOAD]] ; CHECK: store [[TMP1]], * {{.*}}, align 4 +; CHECK-TF-LABEL: @fneg_f32( +; 
CHECK-TF-NEXT: entry: +; CHECK-TF: vector.body: +; CHECK-TF: [[WIDE_LOAD:%.*]] = call @llvm.masked.load.nxv4f32 +; CHECK-TF-NEXT: [[TMP1:%.*]] = fneg [[WIDE_LOAD]] +; CHECK-TF: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[TMP1]] entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue -S %s -o - | FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s -o - | FileCheck %s --check-prefix=CHECK-TF define void @cond_inv_load_i32i32i16(i32* noalias nocapture %a, i32* noalias nocapture readonly %cond, i16* noalias nocapture readonly %inv, i64 %n) #0 { ; CHECK-LABEL: @cond_inv_load_i32i32i16( @@ -58,6 +60,69 @@ ; CHECK: exit: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @cond_inv_load_i32i32i16( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16* [[INV:%.*]], i64 0 +; CHECK-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-TF-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( [[BROADCAST_SPLAT]], i32 2, [[TMP12]], undef) +; CHECK-TF-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_GATHER]] to +; CHECK-TF-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 
[[INDEX]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP13]], * [[TMP15]], i32 4, [[TMP12]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP17]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[I_07]] +; CHECK-TF-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP19]], 0 +; CHECK-TF-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK-TF: if.then: +; CHECK-TF-NEXT: [[TMP20:%.*]] = load i16, i16* [[INV]], align 2 +; CHECK-TF-NEXT: [[CONV:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-TF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_07]] +; CHECK-TF-NEXT: store i32 [[CONV]], i32* [[ARRAYIDX1]], align 4 +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-TF: exit: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -138,6 +203,67 @@ ; CHECK: exit: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @cond_inv_load_f64f64f64( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double* [[INV:%.*]], i64 0 +; CHECK-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; 
CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[COND:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f64.p0nxv4f64(* [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = fcmp ogt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, double 4.000000e-01, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f64.nxv4p0f64( [[BROADCAST_SPLAT]], i32 8, [[TMP12]], undef) +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4f64.p0nxv4f64( [[WIDE_MASKED_GATHER]], * [[TMP14]], i32 8, [[TMP12]]) +; CHECK-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[COND]], i64 [[I_08]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-TF-NEXT: [[CMP1:%.*]] = fcmp ogt double [[TMP18]], 4.000000e-01 +; CHECK-TF-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-TF: if.then: +; CHECK-TF-NEXT: [[TMP19:%.*]] = load double, double* [[INV]], align 8 +; CHECK-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]] +; CHECK-TF-NEXT: store double [[TMP19]], double* [[ARRAYIDX2]], align 8 +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-TF: exit: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -226,6 +352,76 @@ ; CHECK: for.end: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @invariant_load_cond( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw 
i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 42 +; CHECK-TF-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32* [[TMP9]], i64 0 +; CHECK-TF-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP12:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[B]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer +; CHECK-TF-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP13]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP15]], i32 4, [[TMP14]], poison) +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[DOTSPLAT]], i32 4, [[TMP14]], undef) +; CHECK-TF-NEXT: [[TMP16:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD1]] +; CHECK-TF-NEXT: [[TMP17:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP16]], * [[TMP18]], i32 4, [[TMP14]]) +; CHECK-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP20:%.*]] = shl nuw nsw i64 [[TMP19]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]] +; CHECK-TF-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-TF-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP22]], 0 +; CHECK-TF-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK-TF: if.then: +; 
CHECK-TF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 42 +; CHECK-TF-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]] +; CHECK-TF-NEXT: [[TMP23:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-TF-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-TF-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP24]], [[TMP23]] +; CHECK-TF-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]] +; CHECK-TF-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; CHECK-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-TF: for.end: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -mattr=+sve \ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s -S | FileCheck %s +; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s -S | FileCheck %s --check-prefix=CHECK-TF target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" @@ -78,6 +80,76 @@ ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; +; CHECK-TF-LABEL: @fneg( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[S2:%.*]] = ptrtoint ptr [[S:%.*]] to i64 +; CHECK-TF-NEXT: [[D1:%.*]] = ptrtoint ptr [[D:%.*]] to i64 +; CHECK-TF-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-TF-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-TF: for.body.preheader: +; CHECK-TF-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-TF-NEXT: [[TMP0:%.*]] = sub i64 -1, [[WIDE_TRIP_COUNT]] +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 +; CHECK-TF-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP2]]) +; CHECK-TF-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]] +; CHECK-TF-NEXT: br i1 [[TMP4]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-TF: vector.memcheck: +; CHECK-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-TF-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = sub i64 [[D1]], [[S2]] +; CHECK-TF-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-TF-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8 +; CHECK-TF-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-TF-NEXT: [[TMP13:%.*]] = sub i64 [[TMP12]], 1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 
[[WIDE_TRIP_COUNT]], [[TMP13]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP10]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0 +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[TMP14]] +; CHECK-TF-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, ptr [[TMP15]], i32 0 +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f16.p0(ptr [[TMP16]], i32 2, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP17:%.*]] = fneg [[WIDE_MASKED_LOAD]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[TMP14]] +; CHECK-TF-NEXT: [[TMP19:%.*]] = getelementptr inbounds half, ptr [[TMP18]], i32 0 +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv8f16.p0( [[TMP17]], ptr [[TMP19]], i32 2, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 8 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-TF-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 +; CHECK-TF-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.cond.cleanup.loopexit: +; CHECK-TF-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP24:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; CHECK-TF-NEXT: [[FNEG:%.*]] = fneg half [[TMP24]] +; CHECK-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: store half [[FNEG]], ptr [[ARRAYIDX2]], align 2 +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; entry: %cmp6 = icmp sgt i32 %n, 0 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll =================================================================== --- 
llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll @@ -1,11 +1,15 @@ ; REQUIRES: asserts ; RUN: opt -loop-vectorize -mcpu=neoverse-v1 -disable-output %s -debug \ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -mcpu=neoverse-v1 -disable-output %s -debug \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue 2>&1 | FileCheck %s --check-prefix=CHECK-TF target triple="aarch64--linux-gnu" ; CHECK: LV: Checking a loop in 'gather_nxv4i32_loaded_index' ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %1 = load float, float* %arrayidx3, align 4 +; CHECK-TF: LV: Checking a loop in 'gather_nxv4i32_loaded_index' +; CHECK-TF: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %1 = load float, float* %arrayidx3, align 4 define void @gather_nxv4i32_loaded_index(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 { entry: br label %for.body @@ -28,6 +32,8 @@ ; CHECK: LV: Checking a loop in 'scatter_nxv4i32_loaded_index' ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %1, float* %arrayidx5, align 4 +; CHECK-TF: LV: Checking a loop in 'scatter_nxv4i32_loaded_index' +; CHECK-TF: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %1, float* %arrayidx5, align 4 define void @scatter_nxv4i32_loaded_index(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 { entry: br label %for.body @@ -52,6 +58,8 @@ ; to ensure the stride value is always 1. Therefore, it can assume a contiguous load and a cost of 1. ; CHECK: LV: Checking a loop in 'gather_nxv4i32_unknown_stride' ; CHECK: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 +; CHECK-TF: LV: Checking a loop in 'gather_nxv4i32_unknown_stride' +; CHECK-TF: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 define void @gather_nxv4i32_unknown_stride(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %stride, i64 %n) #0 { entry: br label %for.body @@ -75,6 +83,8 @@ ; to ensure the stride value is always 1. Therefore, it can assume a contiguous load and cost is 1. 
; CHECK: LV: Checking a loop in 'scatter_nxv4i32_unknown_stride' ; CHECK: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: store float %0, float* %arrayidx2, align 4 +; CHECK-TF: LV: Checking a loop in 'scatter_nxv4i32_unknown_stride' +; CHECK-TF: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: store float %0, float* %arrayidx2, align 4 define void @scatter_nxv4i32_unknown_stride(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %stride, i64 %n) #0 { entry: br label %for.body @@ -96,6 +106,8 @@ ; CHECK: LV: Checking a loop in 'gather_nxv4i32_stride2' ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 +; CHECK-TF: LV: Checking a loop in 'gather_nxv4i32_stride2' +; CHECK-TF: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 define void @gather_nxv4i32_stride2(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { entry: br label %for.body @@ -139,6 +151,8 @@ ; CHECK: LV: Checking a loop in 'gather_nxv4i32_stride64' ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 +; CHECK-TF: LV: Checking a loop in 'gather_nxv4i32_stride64' +; CHECK-TF: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: %0 = load float, float* %arrayidx, align 4 define void @gather_nxv4i32_stride64(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { entry: br label %for.body @@ -160,6 +174,8 @@ ; CHECK: LV: Checking a loop in 'scatter_nxv4i32_stride64' ; CHECK: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %0, float* %arrayidx2, align 4 +; CHECK-TF: LV: Checking a loop in 'scatter_nxv4i32_stride64' +; CHECK-TF: LV: Found an estimated cost of 81 for VF vscale x 4 For instruction: store float %0, float* %arrayidx2, align 4 define void @scatter_nxv4i32_stride64(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue -S %s -force-target-instruction-cost=1 -o - | FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s \ +; RUN: -force-target-instruction-cost=1 -o - | FileCheck %s --check-prefix=CHECK-TF define void @gather_nxv4i32_ind64(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 { ; CHECK-LABEL: @gather_nxv4i32_ind64( @@ -50,6 +53,60 @@ ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @gather_nxv4i32_ind64( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; 
CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i64.p0nxv4i64(* [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], undef) +; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[WIDE_MASKED_GATHER]], * [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP17:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 +; CHECK-TF-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; CHECK-TF-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: store float [[TMP18]], float* [[ARRAYIDX5]], align 4 +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], 
!llvm.loop [[LOOP3:![0-9]+]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -122,6 +179,62 @@ ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @scatter_nxv4i32_ind32( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD1]] to +; CHECK-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[A:%.*]], [[TMP13]] +; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( [[WIDE_MASKED_LOAD]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: 
[[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; CHECK-TF-NEXT: [[IDXPROM4:%.*]] = sext i32 [[TMP19]] to i64 +; CHECK-TF-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IDXPROM4]] +; CHECK-TF-NEXT: store float [[TMP18]], float* [[ARRAYIDX5]], align 4 +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -191,6 +304,62 @@ ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @scatter_inv_nxv4i32( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[INV:%.*]], i64 0 +; CHECK-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-TF-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), [[BROADCAST_SPLAT]], i32 4, [[TMP12]]) +; CHECK-TF-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP15]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-TF: middle.block: 
+; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP16]], 0 +; CHECK-TF-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK-TF: if.then: +; CHECK-TF-NEXT: store i32 3, i32* [[INV]], align 4 +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -266,6 +435,65 @@ ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @gather_inv_nxv4i32( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32* [[INV:%.*]], i64 0 +; CHECK-TF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP10]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP11:%.*]] = icmp sgt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], i32 4, [[TMP12]], undef) +; CHECK-TF-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP9]] to 
* +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[WIDE_MASKED_GATHER]], * [[TMP13]], i32 4, [[TMP12]]) +; CHECK-TF-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[TMP17]], 3 +; CHECK-TF-NEXT: br i1 [[CMP2]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK-TF: if.then: +; CHECK-TF-NEXT: [[TMP18:%.*]] = load i32, i32* [[INV]], align 4 +; CHECK-TF-NEXT: store i32 [[TMP18]], i32* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: br label [[FOR_INC]] +; CHECK-TF: for.inc: +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body @@ -354,6 +582,64 @@ ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @gather_nxv4i32_ind64_stride2( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-TF-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 +; CHECK-TF-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, 
[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP12:%.*]] = shl [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[B:%.*]], [[TMP12]] +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], undef) +; CHECK-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( [[WIDE_MASKED_GATHER]], * [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP17]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[INDVARS_IV_STRIDE2:%.*]] = shl i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_STRIDE2]] +; CHECK-TF-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; CHECK-TF-NEXT: store float [[TMP19]], float* [[ARRAYIDX2]], align 4 +; CHECK-TF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll @@ -1,6 +1,8 @@ ; REQUIRES: asserts ; RUN: opt -loop-vectorize -S < %s -debug -prefer-predicate-over-epilogue=scalar-epilogue 2>%t | FileCheck %s ; RUN: cat %t | FileCheck %s --check-prefix=DEBUG +; RUN: opt -loop-vectorize -S < %s -debug -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG target triple = "aarch64-unknown-linux-gnu" @@ -13,23 +15,23 @@ ; CHECK: vector.ph: ; CHECK: [[TMP4:%.*]] = 
call @llvm.experimental.stepvector.nxv2i8()
 ; CHECK: [[TMP5:%.*]] = trunc [[TMP4]] to
-; CHECK-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer
+; CHECK: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer
 ; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i7 1, i32 0), poison, zeroinitializer)
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP11:%.*]] = add [[VEC_IND]], zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP10]]
 ; CHECK-NEXT: [[EXT:%.+]] = zext [[TMP11]] to
 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i32 0
 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP13]] to *
-; CHECK-NEXT: store [[EXT]], * [[TMP14]], align 8
+; CHECK-NEXT: store{{.*}} [[EXT]], * [[TMP14]]
 ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]],
+; CHECK-NEXT: [[INDEX_NEXT]] = add {{.*}}i64 [[INDEX]], [[TMP16]]
+; CHECK: [[VEC_IND_NEXT]] = add [[VEC_IND]],
 ;
 entry:
 br label %for.body
@@ -65,17 +67,17 @@
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP10:%.*]] = zext [[VEC_IND]] to
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP9]]
 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i32 0
 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP13]] to *
-; CHECK-NEXT: store [[TMP10]], * [[TMP14]], align 8
+; CHECK-NEXT: store{{.*}} [[TMP10]], * [[TMP14]]
 ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]],
+; CHECK-NEXT: [[INDEX_NEXT]] = add {{.*}}i64 [[INDEX]], [[TMP16]]
+; CHECK: [[VEC_IND_NEXT]] = add [[VEC_IND]],
 ;
 entry:
 br label %for.body
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -force-target-instruction-cost=1 -dce -instcombine \
 ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s -S | FileCheck %s
+; RUN: opt -loop-vectorize -force-target-instruction-cost=1 -dce -instcombine \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s -S | FileCheck %s --check-prefix=CHECK-TF
 target triple = "aarch64-linux-gnu"
@@ -69,6 +71,71 @@
 ; CHECK: exit:
 ;
CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @cond_ind64( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 2 +; CHECK-TF-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 +; CHECK-TF-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP12:%.*]] = trunc [[VEC_IND]] to +; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP14:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer +; CHECK-TF-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP13]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP15]], i32 4, [[TMP14]], poison) +; CHECK-TF-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[WIDE_MASKED_LOAD]], * [[TMP17]], i32 4, [[TMP14]]) +; CHECK-TF-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP19:%.*]] = shl i64 [[TMP18]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP19]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-TF-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP20]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[AND:%.*]] = and i64 [[I_08]], 1 +; CHECK-TF-NEXT: 
[[TOBOOL_NOT:%.*]] = icmp eq i64 [[AND]], 0
+; CHECK-TF-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK-TF: if.then:
+; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_08]]
+; CHECK-TF-NEXT: [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-TF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_08]]
+; CHECK-TF-NEXT: store i32 [[TMP21]], i32* [[ARRAYIDX1]], align 4
+; CHECK-TF-NEXT: br label [[FOR_INC]]
+; CHECK-TF: for.inc:
+; CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-TF: exit:
+; CHECK-TF-NEXT: ret void
+;
 entry:
 br label %for.body
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-loads.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -S -loop-vectorize -mattr=+sve -mtriple aarch64-linux-gnu \
 ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -mattr=+sve -mtriple aarch64-linux-gnu \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF
 define void @invariant_load(i64 %n, i32* noalias nocapture %a, i32* nocapture readonly %b) {
 ; CHECK-LABEL: @invariant_load
@@ -11,6 +13,16 @@
 ; CHECK: %[[LOAD:.*]] = load , *
 ; CHECK-NEXT: %[[ADD:.*]] = add nsw %[[SPLAT]], %[[LOAD]]
 ; CHECK: store %[[ADD]], *
+
+; CHECK-TF-LABEL: @invariant_load
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42
+; CHECK-TF-NEXT: %[[INVLOAD:.*]] = load i32, i32* %[[GEP]]
+; CHECK-TF-NEXT: %[[SPLATINS:.*]] = insertelement poison, i32 %[[INVLOAD]], i32 0
+; CHECK-TF-NEXT: %[[SPLAT:.*]] = shufflevector %[[SPLATINS]], poison, zeroinitializer
+; CHECK-TF: %[[LOAD:.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(*
+; CHECK-TF-NEXT: %[[ADD:.*]] = add nsw %[[SPLAT]], %[[LOAD]]
+; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32( %[[ADD]]
 entry:
 br label %for.body
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -S -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s
+; RUN: opt -loop-vectorize -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF
 target triple = "aarch64-unknown-linux-gnu"
@@ -48,6 +49,14 @@
 ; CHECK: for.inc24:
 ; CHECK-NEXT: ret void
 ;
+
+; CHECK-TF-LABEL: @inv_store_i16(
+; CHECK-TF: vector.ph:
+; CHECK-TF: %[[TMP1:.*]] = insertelement poison, i16* %dst, i32 0
+; CHECK-TF-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector %[[TMP1]], poison, zeroinitializer
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[VECLOAD:.*]] = call @llvm.masked.load.nxv4i16.p0nxv4i16(
+; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %[[VECLOAD]]
 entry:
 br label %for.body14
@@ -116,6 +125,17 @@
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
+
+; CHECK-TF-LABEL: @cond_inv_store_i32(
+; CHECK-TF: vector.ph:
+; CHECK-TF: %[[TMP1:.*]] = insertelement poison, i32* %dst, i32 0
+; CHECK-TF-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector %[[TMP1]], poison, zeroinitializer
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi
+; CHECK-TF: %[[VECLOAD:.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(
+; CHECK-TF-NEXT: %[[MASK:.*]] = icmp sgt %[[VECLOAD]], zeroinitializer
+; CHECK-TF: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MASK]], zeroinitializer
+; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[VECLOAD]], %[[SPLAT_PTRS]], i32 4, %[[SEL]])
 entry:
 br label %for.body
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine -S \
 ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue <%s | FileCheck %s
+; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine -S \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue <%s | FileCheck %s --check-prefix=CHECK-TF
 define void @stride7_i32(i32* noalias nocapture %dst, i64 %n) #0 {
 ; CHECK-LABEL: @stride7_i32(
@@ -10,6 +12,15 @@
 ; CHECK-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( %[[PTRS]]
 ; CHECK-NEXT: %[[VALS:.*]] = add nsw %[[GLOAD]],
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[VALS]], %[[PTRS]]
+
+; CHECK-TF-LABEL: @stride7_i32(
+; CHECK-TF: vector.body
+; CHECK-TF: %[[VEC_IND:.*]] = phi [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-TF-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw %[[VEC_IND]], shufflevector ( insertelement ( poison, i64 7, i32 0), poison, zeroinitializer)
+; CHECK-TF-NEXT: %[[PTRS:.*]] = getelementptr inbounds i32, i32* %dst, %[[PTR_INDICES]]
+; CHECK-TF-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( %[[PTRS]]
+; CHECK-TF-NEXT: %[[VALS:.*]] = add nsw %[[GLOAD]],
+; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %[[VALS]], %[[PTRS]]
 entry:
 br label %for.body
@@ -37,6 +48,15 @@
 ; CHECK-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0f64( %[[PTRS]],
 ; CHECK-NEXT: %[[VALS:.*]] = fadd %[[GLOAD]],
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( %[[VALS]], %[[PTRS]],
+
+; CHECK-TF-LABEL: @stride7_f64(
+; CHECK-TF: vector.body
+; CHECK-TF: %[[VEC_IND:.*]] = phi [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-TF-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw %[[VEC_IND]], shufflevector ( insertelement ( poison, i64 7, i32 0), poison, zeroinitializer)
+; CHECK-TF-NEXT: %[[PTRS:.*]] = getelementptr inbounds double, double* %dst, %[[PTR_INDICES]]
+; CHECK-TF-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0f64( %[[PTRS]],
+; CHECK-TF-NEXT: %[[VALS:.*]] = fadd %[[GLOAD]],
+; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( %[[VALS]], %[[PTRS]],
 entry:
 br label %for.body
@@ -64,6 +84,16 @@
 ; CHECK-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0f64( %[[PTRS]], i32 8, %[[MASK]]
 ; CHECK-NEXT: %[[VALS:.*]] = fadd %[[GLOAD]],
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( %[[VALS]], %[[PTRS]], i32 8, %[[MASK]])
+
+; CHECK-TF-LABEL: @cond_stride7_f64(
+; CHECK-TF: vector.body
+; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi
+; CHECK-TF: %[[MASK:.*]] = icmp ne
+; CHECK-TF: %[[PTRS:.*]] =
getelementptr inbounds double, double* %dst, %{{.*}} +; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MASK]], zeroinitializer +; CHECK-TF-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0f64( %[[PTRS]], i32 8, %[[SEL]] +; CHECK-TF-NEXT: %[[VALS:.*]] = fadd %[[GLOAD]], +; CHECK-TF-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( %[[VALS]], %[[PTRS]], i32 8, %[[SEL]]) entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll @@ -1,5 +1,7 @@ ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue -S %s -o - | FileCheck %s +; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s -o - | FileCheck %s --check-prefix=CHECK-TF define void @mloadstore_f32(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) { ; CHECK-LABEL: @mloadstore_f32 @@ -12,6 +14,19 @@ ; CHECK-NEXT: %[[FADD:.*]] = fadd %[[LOAD1]], %[[LOAD2]] ; CHECK-NEXT: %[[MSTORE_PTRS:.*]] = bitcast float* %[[GEPA]] to * ; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( %[[FADD]], * %[[MSTORE_PTRS]], i32 4, %[[MASK]]) + +; CHECK-TF-LABEL: @mloadstore_f32 +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv4f32 +; CHECK-TF-NEXT: %[[MASK:.*]] = fcmp ogt %[[LOAD1]], +; CHECK-TF-NEXT: %[[GEPA:.*]] = getelementptr float, float* %a, +; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MASK]], zeroinitializer +; CHECK-TF-NEXT: %[[MLOAD_PTRS:.*]] = bitcast float* %[[GEPA]] to * +; CHECK-TF-NEXT: %[[LOAD2:.*]] = call @llvm.masked.load.nxv4f32.p0nxv4f32(* %[[MLOAD_PTRS]], i32 4, %[[SEL]] +; CHECK-TF-NEXT: %[[FADD:.*]] = fadd %[[LOAD1]], %[[LOAD2]] +; CHECK-TF-NEXT: %[[MSTORE_PTRS:.*]] = bitcast float* %[[GEPA]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32( %[[FADD]], * %[[MSTORE_PTRS]], i32 4, %[[SEL]]) entry: br label %for.body @@ -49,6 +64,19 @@ ; CHECK-NEXT: %[[FADD:.*]] = add %[[LOAD1]], %[[LOAD2]] ; CHECK-NEXT: %[[MSTORE_PTRS:.*]] = bitcast i32* %[[GEPA]] to * ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( %[[FADD]], * %[[MSTORE_PTRS]], i32 4, %[[MASK]]) + +; CHECK-TF-LABEL: @mloadstore_i32 +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi +; CHECK-TF: %[[LOAD1:.*]] = call @llvm.masked.load.nxv4i32 +; CHECK-TF-NEXT: %[[MASK:.*]] = icmp ne %[[LOAD1]], +; CHECK-TF-NEXT: %[[GEPA:.*]] = getelementptr i32, i32* %a, +; CHECK-TF-NEXT: %[[SEL:.*]] = select %[[ACTIVE_LANE_MASK]], %[[MASK]], zeroinitializer +; CHECK-TF-NEXT: %[[MLOAD_PTRS:.*]] = bitcast i32* %[[GEPA]] to * +; CHECK-TF-NEXT: %[[LOAD2:.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* %[[MLOAD_PTRS]], i32 4, %[[SEL]] +; CHECK-TF-NEXT: %[[FADD:.*]] = add %[[LOAD1]], %[[LOAD2]] +; CHECK-TF-NEXT: %[[MSTORE_PTRS:.*]] = bitcast i32* %[[GEPA]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( %[[FADD]], * %[[MSTORE_PTRS]], i32 4, %[[SEL]]) entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll =================================================================== --- 
llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
@@ -2,6 +2,8 @@
 ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-VF4IC1
 ; RUN: opt -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S \
 ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-VF4IC4
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF-VF4IC1
 target triple = "aarch64-linux-gnu"
@@ -41,6 +43,18 @@
 ; CHECK-VF4IC4-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL7]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer)
 ; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]])
 ; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+
+; CHECK-TF-VF4IC1-LABEL: @select_const_i32_from_icmp
+; CHECK-TF-VF4IC1: vector.body:
+; CHECK-TF-VF4IC1: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-TF-VF4IC1: [[VEC_LOAD:%.*]] = call @llvm.masked.load.nxv4i32
+; CHECK-TF-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer)
+; CHECK-TF-VF4IC1-NEXT: [[VEC_SEL1:%.*]] = select [[VEC_ICMP]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 7, i32 0), poison, zeroinitializer)
+; CHECK-TF-VF4IC1-NEXT: [[VEC_SEL2:%.*]] = select %active.lane.mask, [[VEC_SEL1]], [[VEC_PHI]]
+; CHECK-TF-VF4IC1: middle.block:
+; CHECK-TF-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL2]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer)
+; CHECK-TF-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]])
+; CHECK-TF-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 entry:
 br label %for.body
@@ -80,6 +94,9 @@
 ; CHECK-VF4IC4-LABEL: @select_i32_from_icmp
 ; CHECK-VF4IC4: vector.body:
+
+; CHECK-TF-VF4IC1-LABEL: @select_i32_from_icmp
+; CHECK-TF-VF4IC1: vector.body:
 entry:
 br label %for.body
@@ -112,6 +129,9 @@
 ; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp
 ; CHECK-VF4IC4: vector.body:
+
+; CHECK-TF-VF4IC1-LABEL: @select_const_i32_from_fcmp
+; CHECK-TF-VF4IC1: vector.body:
 entry:
 br label %for.body
@@ -135,6 +155,8 @@
 ; CHECK-VF4IC1-NOT: vector.body
 ; CHECK-VF4IC4-LABEL: @select_const_f32_from_icmp
 ; CHECK-VF4IC4-NOT: vector.body
+; CHECK-TF-VF4IC1-LABEL: @select_const_f32_from_icmp
+; CHECK-TF-VF4IC1-NOT: vector.body
 entry:
 br label %for.body
@@ -170,6 +192,9 @@
 ; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp
 ; CHECK-VF4IC4: vector.body:
+
+; CHECK-TF-VF4IC1-LABEL: @pred_select_const_i32_from_icmp
+; CHECK-TF-VF4IC1: vector.body:
 entry:
 br label %for.body
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
@@ -4,6 +4,9 @@
 ; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \
 ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue -force-vector-interleave=1 \
 ; RUN: -mcpu=neoverse-n2 -S 2>&1 | FileCheck %s --check-prefix=CHECK-CPU-NEOVERSE-N2
+; RUN: opt < %s -loop-vectorize -debug -disable-output -force-ordered-reductions=true -hints-allow-reordering=false \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-interleave=1 \
+; RUN: -S 2>&1 | FileCheck %s --check-prefix=CHECK-TF
 target triple="aarch64-unknown-linux-gnu"
@@ -11,6 +14,8 @@
 ; CHECK: Found an estimated cost of 16 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
 ; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
 ; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-TF: Found an estimated cost of 8 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
+; CHECK-TF: Found an estimated cost of 16 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
 define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 {
 entry:
@@ -33,6 +38,7 @@
 ; CHECK: Found an estimated cost of 8 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
 ; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
+; CHECK-TF: Found an estimated cost of 8 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
 define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 {
 entry:
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -12,12 +12,15 @@
 ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S \
 ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 define void @vector_reverse_mask_nxv4i1(double* %a, double* %cond, i64 %N) #0 {
 ; CHECK-LABEL: vector.body:
+; CHECK: %{{.*}} = fcmp une
 ; CHECK: %[[REVERSE6:.*]] = call @llvm.experimental.vector.reverse.nxv4i1( %{{.*}})
 ; CHECK: %[[WIDEMSKLOAD:.*]] = call @llvm.masked.load.nxv4f64.p0nxv4f64(* %{{.*}}, i32 8, %[[REVERSE6]], poison)
 ; CHECK-NEXT: %[[FADD:.*]] = fadd %[[WIDEMSKLOAD]]
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -7,6 +7,8 @@
 ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S \
 ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF
 define void @vector_reverse_f64(i64 %N, double* noalias %a, double* noalias %b) #0{
 ; CHECK-LABEL: @vector_reverse_f64(
@@ -71,6 +73,78 @@
 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1
 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
+;
CHECK-TF-LABEL: @vector_reverse_f64( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-TF-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-TF: for.body.preheader: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 3 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 3 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC_NEG:%.*]] = sub i64 [[N_MOD_VF]], [[N_RND_UP]] +; CHECK-TF-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC_NEG]], [[N]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = xor i64 [[INDEX]], -1 +; CHECK-TF-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], [[N]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP10]] +; CHECK-TF-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP12]], -8 +; CHECK-TF-NEXT: [[TMP13:%.*]] = or i32 [[DOTNEG]], 1 +; CHECK-TF-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[TMP11]], i64 [[TMP14]] +; CHECK-TF-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vector.reverse.nxv8i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f64.p0nxv8f64(* nonnull [[TMP16]], i32 8, [[REVERSE]], poison) +; CHECK-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP10]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, double 1.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[DOTNEG4:%.*]] = mul i32 [[TMP19]], -8 +; CHECK-TF-NEXT: [[TMP20:%.*]] = or i32 [[DOTNEG4]], 1 +; CHECK-TF-NEXT: [[TMP21:%.*]] = sext i32 [[TMP20]] to i64 +; CHECK-TF-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP17]], i64 [[TMP21]] +; CHECK-TF-NEXT: [[REVERSE3:%.*]] = call @llvm.experimental.vector.reverse.nxv8i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv8f64.p0nxv8f64( [[TMP18]], * [[TMP23]], i32 8, [[REVERSE3]]) +; CHECK-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP25:%.*]] = shl i64 [[TMP24]], 3 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP25]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call 
@llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP26:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP26]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.cond.cleanup.loopexit: +; CHECK-TF-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[I_08_IN:%.*]] = phi i64 [ [[I_08:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[I_08]] = add nsw i64 [[I_08_IN]], -1 +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[I_08]] +; CHECK-TF-NEXT: [[TMP27:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-TF-NEXT: [[ADD:%.*]] = fadd double [[TMP27]], 1.000000e+00 +; CHECK-TF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]] +; CHECK-TF-NEXT: store double [[ADD]], double* [[ARRAYIDX1]], align 8 +; CHECK-TF-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1 +; CHECK-TF-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; entry: %cmp7 = icmp sgt i64 %N, 0 br i1 %cmp7, label %for.body, label %for.cond.cleanup @@ -165,6 +239,89 @@ ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_09_IN]], 1 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; +; CHECK-TF-LABEL: @vector_reverse_i64( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[A2:%.*]] = ptrtoint i64* [[A:%.*]] to i64 +; CHECK-TF-NEXT: [[B1:%.*]] = ptrtoint i64* [[B:%.*]] to i64 +; CHECK-TF-NEXT: [[CMP8:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-TF-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-TF: for.body.preheader: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-TF: vector.memcheck: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 6 +; CHECK-TF-NEXT: [[TMP6:%.*]] = shl i64 [[N]], 3 +; CHECK-TF-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], [[B1]] +; CHECK-TF-NEXT: [[TMP8:%.*]] = add i64 [[TMP6]], [[A2]] +; CHECK-TF-NEXT: [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]] +; CHECK-TF-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP9]], [[TMP5]] +; CHECK-TF-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 3 +; CHECK-TF-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 3 +; CHECK-TF-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP14]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP11]] +; CHECK-TF-NEXT: [[N_VEC_NEG:%.*]] = sub i64 [[N_MOD_VF]], [[N_RND_UP]] +; CHECK-TF-NEXT: 
[[IND_END:%.*]] = add i64 [[N_VEC_NEG]], [[N]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP15:%.*]] = xor i64 [[INDEX]], -1 +; CHECK-TF-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], [[N]] +; CHECK-TF-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP16]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP18]], -8 +; CHECK-TF-NEXT: [[TMP19:%.*]] = or i32 [[DOTNEG]], 1 +; CHECK-TF-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64 +; CHECK-TF-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, i64* [[TMP17]], i64 [[TMP20]] +; CHECK-TF-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vector.reverse.nxv8i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP22:%.*]] = bitcast i64* [[TMP21]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8i64.p0nxv8i64(* nonnull [[TMP22]], i32 8, [[REVERSE]], poison) +; CHECK-TF-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP16]] +; CHECK-TF-NEXT: [[TMP24:%.*]] = add [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[DOTNEG6:%.*]] = mul i32 [[TMP25]], -8 +; CHECK-TF-NEXT: [[TMP26:%.*]] = or i32 [[DOTNEG6]], 1 +; CHECK-TF-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64 +; CHECK-TF-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP23]], i64 [[TMP27]] +; CHECK-TF-NEXT: [[REVERSE5:%.*]] = call @llvm.experimental.vector.reverse.nxv8i1( [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP29:%.*]] = bitcast i64* [[TMP28]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv8i64.p0nxv8i64( [[TMP24]], * [[TMP29]], i32 8, [[REVERSE5]]) +; CHECK-TF-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP31:%.*]] = shl i64 [[TMP30]], 3 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP31]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP32:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-TF-NEXT: br i1 [[TMP32]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.cond.cleanup.loopexit: +; CHECK-TF-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-TF: for.cond.cleanup: +; CHECK-TF-NEXT: ret void +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[I_09_IN:%.*]] = phi i64 [ [[I_09:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[I_09]] = add nsw i64 [[I_09_IN]], -1 +; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I_09]] +; CHECK-TF-NEXT: [[TMP33:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 +; CHECK-TF-NEXT: [[ADD:%.*]] = add i64 [[TMP33]], 1 +; CHECK-TF-NEXT: 
[[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_09]] +; CHECK-TF-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX2]], align 8 +; CHECK-TF-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_09_IN]], 1 +; CHECK-TF-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] +; entry: %cmp8 = icmp sgt i64 %N, 0 br i1 %cmp8, label %for.body, label %for.cond.cleanup Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -2,6 +2,8 @@ ; REQUIRES: asserts ; RUN: opt -loop-vectorize -S -mtriple=aarch64 -mattr=+sve -debug-only=loop-vectorize \ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -S -mtriple=aarch64 -mattr=+sve \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s 2>&1 | FileCheck %s --check-prefix=CHECK-TF target triple = "aarch64-unknown-linux-gnu" @@ -105,6 +107,85 @@ ; CHECK: exit: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @pointer_induction_used_as_vector( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]] +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2 +; CHECK-TF-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[IND_END:%.*]] = getelementptr i8*, i8** [[START_1:%.*]], i64 [[N_VEC]] +; CHECK-TF-NEXT: [[IND_END2:%.*]] = getelementptr i8, i8* [[START_2:%.*]], i64 [[N_VEC]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP9]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-TF-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 1 +; CHECK-TF-NEXT: [[TMP13:%.*]] = mul i64 1, [[TMP12]] +; CHECK-TF-NEXT: [[TMP14:%.*]] = mul i64 [[TMP11]], 0 +; CHECK-TF-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP14]], i32 0 +; CHECK-TF-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-TF-NEXT: [[TMP15:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-TF-NEXT: [[TMP16:%.*]] = add [[DOTSPLAT]], [[TMP15]] +; 
CHECK-TF-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP16]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP17:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], [[VECTOR_GEP]] +; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, [[TMP17]], i64 1 +; CHECK-TF-NEXT: [[TMP19:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0 +; CHECK-TF-NEXT: [[TMP20:%.*]] = bitcast i8** [[TMP19]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2p0i8.p0nxv2p0i8( [[TMP18]], * [[TMP20]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement [[TMP17]], i32 0 +; CHECK-TF-NEXT: [[TMP22:%.*]] = getelementptr i8, i8* [[TMP21]], i32 0 +; CHECK-TF-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP22]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0nxv2i8(* [[TMP23]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP24:%.*]] = add [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP22]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2i8.p0nxv2i8( [[TMP24]], * [[TMP25]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP27]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) +; CHECK-TF-NEXT: [[TMP28:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP13]] +; CHECK-TF-NEXT: [[TMP29:%.*]] = extractelement [[TMP28]], i32 0 +; CHECK-TF-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8** [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START_1]], [[ENTRY]] ] +; CHECK-TF-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[START_2]], [[ENTRY]] ] +; CHECK-TF-NEXT: br label [[LOOP_BODY:%.*]] +; CHECK-TF: loop.body: +; CHECK-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_BODY]] ] +; CHECK-TF-NEXT: [[PTR_IV_1:%.*]] = phi i8** [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[PTR_IV_1_NEXT:%.*]], [[LOOP_BODY]] ] +; CHECK-TF-NEXT: [[PTR_IV_2:%.*]] = phi i8* [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[PTR_IV_2_NEXT:%.*]], [[LOOP_BODY]] ] +; CHECK-TF-NEXT: [[PTR_IV_1_NEXT]] = getelementptr inbounds i8*, i8** [[PTR_IV_1]], i64 1 +; CHECK-TF-NEXT: [[PTR_IV_2_NEXT]] = getelementptr inbounds i8, i8* [[PTR_IV_2]], i64 1 +; CHECK-TF-NEXT: store i8* [[PTR_IV_2_NEXT]], i8** [[PTR_IV_1]], align 8 +; CHECK-TF-NEXT: [[LV:%.*]] = load i8, i8* [[PTR_IV_2]], align 1 +; CHECK-TF-NEXT: [[ADD:%.*]] = add i8 [[LV]], 1 +; CHECK-TF-NEXT: store i8 [[ADD]], i8* [[PTR_IV_2]], align 1 +; CHECK-TF-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 +; CHECK-TF-NEXT: [[C:%.*]] = icmp ne i64 [[IV_NEXT]], [[N]] +; CHECK-TF-NEXT: br i1 [[C]], label [[LOOP_BODY]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-TF: exit: +; CHECK-TF-NEXT: ret void +; entry: @@ -179,6 +260,64 @@ ; CHECK: end: ; 
CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @pointer_induction( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = sub i64 -1, [[TMP0]] +; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-TF-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] +; CHECK-TF-NEXT: br i1 [[TMP4]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-TF-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], 1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP9]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[START:%.*]], i64 [[N_VEC]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP10:%.*]] = add i64 [[INDEX2]], 0 +; CHECK-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[START]], i64 [[TMP10]] +; CHECK-TF-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0 +; CHECK-TF-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to * +; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i8.p0nxv2i8(* [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-TF-NEXT: [[TMP13:%.*]] = add [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP11]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2i8.p0nxv2i8( [[TMP13]], * [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 +; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX2]], [[TMP16]] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP0]]) +; CHECK-TF-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0 +; CHECK-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]] +; CHECK-TF: scalar.ph: +; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ] +; CHECK-TF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-TF: for.body: +; CHECK-TF-NEXT: [[PTR_PHI:%.*]] = phi i8* [ [[PTR_PHI_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-TF-NEXT: [[INDEX_NXT]] = add i64 [[INDEX]], 1 +; 
CHECK-TF-NEXT: [[TMP19:%.*]] = load i8, i8* [[PTR_PHI]], align 1 +; CHECK-TF-NEXT: [[ADD:%.*]] = add i8 [[TMP19]], 1 +; CHECK-TF-NEXT: store i8 [[ADD]], i8* [[PTR_PHI]], align 1 +; CHECK-TF-NEXT: [[PTR_PHI_NEXT]] = getelementptr inbounds i8, i8* [[PTR_PHI]], i64 1 +; CHECK-TF-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i8* [[PTR_PHI_NEXT]], [[START]] +; CHECK-TF-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDEX]], [[N]] +; CHECK-TF-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-TF: end: +; CHECK-TF-NEXT: ret void +; entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine -S \ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s +; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine -S \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF ; Ensure that we can vectorize loops such as: ; int *ptr = c; @@ -104,6 +106,110 @@ ; CHECK: for.exit: ; CHECK-NEXT: ret void ; +; CHECK-TF-LABEL: @widen_ptr_phi_unrolled( +; CHECK-TF-NEXT: entry: +; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1 +; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 +; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-TF: vector.ph: +; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 3 +; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1 +; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]] +; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] +; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-TF-NEXT: [[TMP9:%.*]] = shl i64 [[N_VEC]], 1 +; CHECK-TF-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 [[TMP9]] +; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP11]], i64 [[N]]) +; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-TF: vector.body: +; CHECK-TF-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[C]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY2]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[VECTOR_BODY]] ] +; CHECK-TF-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-TF-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 
+; CHECK-TF-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP12]], 4 +; CHECK-TF-NEXT: [[TMP15:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-TF-NEXT: [[VECTOR_GEP:%.*]] = shl [[TMP15]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], [[VECTOR_GEP]] +; CHECK-TF-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 +; CHECK-TF-NEXT: [[DOTSPLAT5:%.*]] = shufflevector [[DOTSPLATINSERT4]], poison, zeroinitializer +; CHECK-TF-NEXT: [[TMP17:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-TF-NEXT: [[TMP18:%.*]] = add [[DOTSPLAT5]], [[TMP17]] +; CHECK-TF-NEXT: [[VECTOR_GEP6:%.*]] = shl [[TMP18]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], [[VECTOR_GEP6]] +; CHECK-TF-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, [[TMP16]], i64 1 +; CHECK-TF-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, [[TMP19]], i64 1 +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP16]], i32 4, [[ACTIVE_LANE_MASK]], undef) +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP19]], i32 4, [[ACTIVE_LANE_MASK3]], undef) +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP20]], i32 4, [[ACTIVE_LANE_MASK]], undef) +; CHECK-TF-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP21]], i32 4, [[ACTIVE_LANE_MASK3]], undef) +; CHECK-TF-NEXT: [[TMP22:%.*]] = add nsw [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP23:%.*]] = add nsw [[WIDE_MASKED_GATHER7]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP22]], * [[TMP25]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[TMP27:%.*]] = shl nuw nsw i32 [[TMP26]], 2 +; CHECK-TF-NEXT: [[TMP28:%.*]] = zext i32 [[TMP27]] to i64 +; CHECK-TF-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 [[TMP28]] +; CHECK-TF-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP23]], * [[TMP30]], i32 4, [[ACTIVE_LANE_MASK3]]) +; CHECK-TF-NEXT: [[TMP31:%.*]] = add nsw [[WIDE_MASKED_GATHER8]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP32:%.*]] = add nsw [[WIDE_MASKED_GATHER9]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-TF-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-TF-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to * +; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP31]], * [[TMP34]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-TF-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-TF-NEXT: [[TMP36:%.*]] = shl nuw nsw i32 [[TMP35]], 2 +; CHECK-TF-NEXT: [[TMP37:%.*]] = zext i32 [[TMP36]] to i64 +; CHECK-TF-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP33]], i64 [[TMP37]] +; CHECK-TF-NEXT: [[TMP39:%.*]] = bitcast i32* 
+; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP32]], * [[TMP39]], i32 4, [[ACTIVE_LANE_MASK3]])
+; CHECK-TF-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP41:%.*]] = shl nuw nsw i64 [[TMP40]], 3
+; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP41]]
+; CHECK-TF-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP43:%.*]] = shl nuw nsw i64 [[TMP42]], 2
+; CHECK-TF-NEXT: [[TMP44:%.*]] = add i64 [[INDEX_NEXT]], [[TMP43]]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP44]], i64 [[N]])
+; CHECK-TF-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP14]]
+; CHECK-TF-NEXT: [[TMP45:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-TF-NEXT: br i1 [[TMP45]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-TF: middle.block:
+; CHECK-TF-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-TF: scalar.ph:
+; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[C]], [[ENTRY:%.*]] ]
+; CHECK-TF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-TF: for.body:
+; CHECK-TF-NEXT: [[PTR_014:%.*]] = phi i32* [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-TF-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-TF-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[PTR_014]], i64 1
+; CHECK-TF-NEXT: [[TMP46:%.*]] = load i32, i32* [[PTR_014]], align 4
+; CHECK-TF-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, i32* [[PTR_014]], i64 2
+; CHECK-TF-NEXT: [[TMP47:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-TF-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP46]], 1
+; CHECK-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_013]]
+; CHECK-TF-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4
+; CHECK-TF-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP47]], 1
+; CHECK-TF-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_013]]
+; CHECK-TF-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX3]], align 4
+; CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1
+; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-TF: for.exit:
+; CHECK-TF-NEXT: ret void
+;
 entry:
 br label %for.body
@@ -205,6 +311,85 @@
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ;
+; CHECK-TF-LABEL: @widen_2ptrs_phi_unrolled(
+; CHECK-TF-NEXT: entry:
+; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[N:%.*]], -1
+; CHECK-TF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
+; CHECK-TF-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]]
+; CHECK-TF-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-TF: vector.ph:
+; CHECK-TF-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; CHECK-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 3
+; CHECK-TF-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], -1
+; CHECK-TF-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP8]], [[N]]
+; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-TF-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[N_VEC]]
+; CHECK-TF-NEXT: [[IND_END2:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[N_VEC]]
+; CHECK-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP10]], i64 [[N]])
+; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-TF: vector.body:
+; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY4]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
+; CHECK-TF-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
+; CHECK-TF-NEXT: [[TMP11:%.*]] = bitcast i32* [[NEXT_GEP]] to *
+; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-TF-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-TF-NEXT: [[TMP13:%.*]] = shl nuw nsw i32 [[TMP12]], 2
+; CHECK-TF-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
+; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i64 [[TMP14]]
+; CHECK-TF-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to *
+; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv4i32.p0nxv4i32(* [[TMP16]], i32 4, [[ACTIVE_LANE_MASK5]], poison)
+; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer)
+; CHECK-TF-NEXT: [[TMP18:%.*]] = shl nsw [[WIDE_MASKED_LOAD9]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer)
+; CHECK-TF-NEXT: [[TMP19:%.*]] = bitcast i32* [[NEXT_GEP7]] to *
+; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP17]], * [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-TF-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-TF-NEXT: [[TMP21:%.*]] = shl nuw nsw i32 [[TMP20]], 2
+; CHECK-TF-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
+; CHECK-TF-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[NEXT_GEP7]], i64 [[TMP22]]
+; CHECK-TF-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to *
+; CHECK-TF-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32( [[TMP18]], * [[TMP24]], i32 4, [[ACTIVE_LANE_MASK5]])
+; CHECK-TF-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[TMP25]], 3
+; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP26]]
+; CHECK-TF-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[TMP27]], 2
+; CHECK-TF-NEXT: [[TMP29:%.*]] = add i64 [[INDEX_NEXT]], [[TMP28]]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT10]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP29]], i64 [[N]])
+; CHECK-TF-NEXT: [[TMP30:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-TF-NEXT: br i1 [[TMP30]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-TF: middle.block:
+; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-TF: scalar.ph:
+; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-TF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ]
+; CHECK-TF-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY]] ]
+; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-TF: for.body:
+; CHECK-TF-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-TF-NEXT: [[S_010:%.*]] = phi i32* [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-TF-NEXT: [[D_09:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
+; CHECK-TF-NEXT: [[TMP31:%.*]] = load i32, i32* [[S_010]], align 4
+; CHECK-TF-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP31]], 1
+; CHECK-TF-NEXT: store i32 [[MUL]], i32* [[D_09]], align 4
+; CHECK-TF-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[D_09]], i64 1
+; CHECK-TF-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, i32* [[S_010]], i64 1
+; CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
+; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-TF: for.cond.cleanup:
+; CHECK-TF-NEXT: ret void
+;
 entry:
 br label %for.body
@@ -297,6 +482,47 @@
 ; CHECK-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR2]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[VAR5]]
 ;
+; CHECK-TF-LABEL: @pointer_iv_mixed(
+; CHECK-TF-NEXT: entry:
+; CHECK-TF-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
+; CHECK-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-TF: vector.ph:
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[SMAX]])
+; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-TF: vector.body:
+; CHECK-TF-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A:%.*]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-TF-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv2i64()
+; CHECK-TF-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], [[TMP2]]
+; CHECK-TF-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32*, i32** [[B:%.*]], i64 [[INDEX]]
+; CHECK-TF-NEXT: [[BC:%.*]] = bitcast [[TMP3]] to *>
+; CHECK-TF-NEXT: [[TMP4:%.*]] = extractelement *> [[BC]], i64 0
+; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i32.p0nxv2i32(* [[TMP4]], i32 8, [[ACTIVE_LANE_MASK]], zeroinitializer)
+; CHECK-TF-NEXT: [[TMP5:%.*]] = bitcast i32** [[NEXT_GEP]] to *
+; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2p0i32.p0nxv2p0i32( [[TMP3]], * [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]])
+; CHECK-TF-NEXT: [[TMP6]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
+; CHECK-TF-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 1
+; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[SMAX]])
+; CHECK-TF-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP1]]
+; CHECK-TF-NEXT: [[TMP9:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-TF-NEXT: br i1 [[TMP9]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-TF: middle.block:
+; CHECK-TF-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[TMP6]])
+; CHECK-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-TF: scalar.ph:
+; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-TF: for.body:
+; CHECK-TF-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-TF: for.end:
+; CHECK-TF-NEXT: [[VAR5:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-TF-NEXT: ret i32 [[VAR5]]
+;
 entry:
 br label %for.body
@@ -370,6 +596,56 @@
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
+; CHECK-TF-LABEL: @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(
+; CHECK-TF-NEXT: entry:
+; CHECK-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-TF: vector.ph:
+; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-TF-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; CHECK-TF-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
+; CHECK-TF-NEXT: [[IND_END:%.*]] = getelementptr i16, i16* [[PTR:%.*]], i64 [[N_VEC]]
+; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-TF: vector.body:
+; CHECK-TF-NEXT: [[POINTER_PHI:%.*]] = phi i16* [ [[PTR]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 1
+; CHECK-TF-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64()
+; CHECK-TF-NEXT: [[TMP5:%.*]] = getelementptr i16, i16* [[POINTER_PHI]], [[TMP4]]
+; CHECK-TF-NEXT: [[TMP6:%.*]] = icmp ne [[TMP5]], zeroinitializer
+; CHECK-TF-NEXT: [[BC:%.*]] = bitcast [[TMP5]] to *>
+; CHECK-TF-NEXT: [[TMP7:%.*]] = extractelement *> [[BC]], i64 0
+; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2i16.p0nxv2i16( zeroinitializer, * [[TMP7]], i32 2, [[TMP6]])
+; CHECK-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 1
+; CHECK-TF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-TF-NEXT: [[PTR_IND]] = getelementptr i16, i16* [[POINTER_PHI]], i64 [[TMP3]]
+; CHECK-TF-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-TF-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-TF: middle.block:
+; CHECK-TF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-TF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-TF: scalar.ph:
+; CHECK-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-TF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[ENTRY]] ]
+; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-TF: for.body:
+; CHECK-TF-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-TF-NEXT: [[IV_PTR:%.*]] = phi i16* [ [[INCDEC_IV_PTR:%.*]], [[IF_END]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-TF-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i16* [[IV_PTR]], null
+; CHECK-TF-NEXT: br i1 [[CMP_I_NOT]], label [[IF_END]], label [[IF_END_SINK_SPLIT:%.*]]
+; CHECK-TF: if.end.sink.split:
+; CHECK-TF-NEXT: store i16 0, i16* [[IV_PTR]], align 2
+; CHECK-TF-NEXT: br label [[IF_END]]
+; CHECK-TF: if.end:
+; CHECK-TF-NEXT: [[INCDEC_IV_PTR]] = getelementptr inbounds i16, i16* [[IV_PTR]], i64 1
+; CHECK-TF-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1
+; CHECK-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp ult i64 [[IV]], 1023
+; CHECK-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-TF: for.end:
+; CHECK-TF-NEXT: ret void
+;
 entry:
 br label %for.body
Index: llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -13,6 +13,8 @@
 ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S \
 ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s --check-prefix=CHECK-TF
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
@@ -91,6 +93,75 @@
 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1
 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
+; CHECK-TF-LABEL: @vector_reverse_mask_v4i1(
+; CHECK-TF-NEXT: entry:
+; CHECK-TF-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
+; CHECK-TF-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-TF: for.body.preheader:
+; CHECK-TF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-TF: vector.ph:
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 4, i64 [[N]])
+; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-TF: vector.body:
+; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[TMP0:%.*]] = xor i64 [[INDEX]], -1
+; CHECK-TF-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[N]]
+; CHECK-TF-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[COND:%.*]], i64 [[TMP1]]
+; CHECK-TF-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 -3
+; CHECK-TF-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> poison, <4 x i32>
+; CHECK-TF-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>*
+; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP4]], i32 8, <4 x i1> [[REVERSE]], <4 x double> poison)
+; CHECK-TF-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32>
+; CHECK-TF-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 -4
+; CHECK-TF-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP5]], i64 -3
+; CHECK-TF-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i1> poison, <4 x i32>
+; CHECK-TF-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <4 x double>*
+; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP7]], i32 8, <4 x i1> [[REVERSE4]], <4 x double> poison)
+; CHECK-TF-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD5]], <4 x double> poison, <4 x i32>
+; CHECK-TF-NEXT: [[TMP8:%.*]] = fcmp une <4 x double> [[REVERSE3]], zeroinitializer
+; CHECK-TF-NEXT: [[TMP9:%.*]] = fcmp une <4 x double> [[REVERSE6]], zeroinitializer
+; CHECK-TF-NEXT: [[TMP10:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[TMP1]]
+; CHECK-TF-NEXT: [[TMP11:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer
+; CHECK-TF-NEXT: [[TMP12:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer
+; CHECK-TF-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP10]], i64 -3
+; CHECK-TF-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <4 x i32>
+; CHECK-TF-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>*
+; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP14]], i32 8, <4 x i1> [[REVERSE7]], <4 x double> poison)
+; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[TMP10]], i64 -4
+; CHECK-TF-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP15]], i64 -3
+; CHECK-TF-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> poison, <4 x i32>
+; CHECK-TF-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
+; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[REVERSE10]], <4 x double> poison)
+; CHECK-TF-NEXT: [[TMP18:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD8]],
+; CHECK-TF-NEXT: [[TMP19:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]],
+; CHECK-TF-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP13]] to <4 x double>*
+; CHECK-TF-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP18]], <4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE7]])
+; CHECK-TF-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
+; CHECK-TF-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP19]], <4 x double>* [[TMP21]], i32 8, <4 x i1> [[REVERSE10]])
+; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-TF-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 12
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT17]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP22]], i64 [[N]])
+; CHECK-TF-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-TF-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-TF: middle.block:
+; CHECK-TF-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-TF: scalar.ph:
+; CHECK-TF-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-TF: for.cond.cleanup.loopexit:
+; CHECK-TF-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK-TF: for.cond.cleanup:
+; CHECK-TF-NEXT: ret void
+; CHECK-TF: for.body:
+; CHECK-TF-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; CHECK-TF: if.then:
+; CHECK-TF-NEXT: br label [[FOR_INC]]
+; CHECK-TF: for.inc:
+; CHECK-TF-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+;
 entry:
 %cmp7 = icmp sgt i64 %N, 0
Index: llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
@@ -7,14 +7,16 @@
 ; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S \
 ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s
+; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s
 define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0 {
 ; CHECK-LABEL: vector_reverse_f64
 ; CHECK-LABEL: vector.body
 ; CHECK: %[[GEP:.*]] = getelementptr inbounds double, double* %{{.*}}, i32 0
 ; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i32 -7
-; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <8 x double>*
-; CHECK-NEXT: %[[WIDE:.*]] = load <8 x double>, <8 x double>* %[[CAST]], align 8
+; CHECK: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <8 x double>*
+; CHECK-NEXT: %[[WIDE:.*]] = {{.*}}load{{.*}}<8 x double>* %[[CAST]]
 ; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x double> %[[WIDE]], <8 x double> poison, <8 x i32>
 ; CHECK-NEXT: %[[FADD:.*]] = fadd <8 x double> %[[REVERSE]]
 ; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* {{.*}}, i64 {{.*}}
@@ -22,7 +24,7 @@
 ; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i32 0
 ; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds double, double* %[[GEP3]], i32 -7
 ; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP4]] to <8 x double>*
-; CHECK-NEXT: store <8 x double> %[[REVERSE6]], <8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: store{{.*}}<8 x double> %[[REVERSE6]], <8 x double>* %[[CAST]]
 entry:
 %cmp7 = icmp sgt i64 %N, 0
@@ -48,8 +50,8 @@
 ; CHECK-LABEL: vector.body
 ; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, i64* %{{.*}}, i32 0
 ; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i32 -7
-; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <8 x i64>*
-; CHECK-NEXT: %[[WIDE:.*]] = load <8 x i64>, <8 x i64>* %[[CAST]], align 8
+; CHECK: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = {{.*}}load{{.*}}<8 x i64>* %[[CAST]]
 ; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x i64> %[[WIDE]], <8 x i64> poison, <8 x i32>
 ; CHECK-NEXT: %[[FADD:.*]] = add <8 x i64> %[[REVERSE]]
 ; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* {{.*}}, i64 {{.*}}
@@ -57,7 +59,7 @@
 ; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i32 0
 ; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds i64, i64* %[[GEP3]], i32 -7
 ; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP4]] to <8 x i64>*
-; CHECK-NEXT: store <8 x i64> %[[REVERSE6]], <8 x i64>* %[[CAST1]], align 8
+; CHECK-NEXT: store{{.*}}<8 x i64> %[[REVERSE6]], <8 x i64>* %[[CAST1]]
 entry:
 %cmp8 = icmp sgt i64 %N, 0