diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -25,7 +25,7 @@ cl::desc( "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " "by autovectorized code. Fractional LMULs are not supported."), - cl::init(1), cl::Hidden); + cl::init(2), cl::Hidden); static cl::opt SLPMaxVF( "riscv-v-slp-max-vf", diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll @@ -14,27 +14,30 @@ ; CHECK-LABEL: @vector_add( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -49,7 +52,7 @@ ; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -74,32 +77,35 @@ ; CHECK-LABEL: @vector_add_reduce( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5]] = add [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7]] = add [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP7]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -111,7 +117,7 @@ ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -11,27 +11,30 @@ ; CHECK-LABEL: @vector_udiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = udiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = udiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -54,26 +57,26 @@ ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = udiv <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = udiv <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: @@ -115,27 +118,30 @@ ; CHECK-LABEL: @vector_sdiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = sdiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = sdiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -158,26 +164,26 @@ ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = sdiv <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = sdiv <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: @@ -219,27 +225,30 @@ ; CHECK-LABEL: @vector_urem( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = urem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = urem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -262,26 +271,26 @@ ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = urem <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = urem <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = urem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = urem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: @@ -323,27 +332,30 @@ ; CHECK-LABEL: @vector_srem( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = srem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = srem [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -366,26 +378,26 @@ ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = srem <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; FIXED-NEXT: [[TMP7:%.*]] = srem <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] -; FIXED-NEXT: store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = srem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[TMP7:%.*]] = srem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]] +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: @@ -427,31 +439,34 @@ ; CHECK-LABEL: @predicated_udiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select [[TMP5]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = udiv [[WIDE_LOAD]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP7]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = udiv [[WIDE_LOAD]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -480,34 +495,34 @@ ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT3]], zeroinitializer -; FIXED-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> -; FIXED-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[BROADCAST_SPLAT3]], <2 x i64> -; FIXED-NEXT: [[TMP10:%.*]] = udiv <2 x i64> [[WIDE_LOAD]], [[TMP8]] -; FIXED-NEXT: [[TMP11:%.*]] = udiv <2 x i64> [[WIDE_LOAD1]], [[TMP9]] -; FIXED-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP13:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP10]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI4:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP11]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT3]], zeroinitializer +; FIXED-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> +; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT3]], <4 x i64> +; FIXED-NEXT: [[TMP10:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[TMP8]] +; FIXED-NEXT: [[TMP11:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]] +; FIXED-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: @@ -561,31 +576,34 @@ ; CHECK-LABEL: @predicated_sdiv( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select [[TMP5]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = sdiv [[WIDE_LOAD]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP7]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -614,34 +632,34 @@ ; FIXED-NEXT: entry: ; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT3]], zeroinitializer -; FIXED-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> -; FIXED-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[BROADCAST_SPLAT3]], <2 x i64> -; FIXED-NEXT: [[TMP10:%.*]] = sdiv <2 x i64> [[WIDE_LOAD]], [[TMP8]] -; FIXED-NEXT: [[TMP11:%.*]] = sdiv <2 x i64> [[WIDE_LOAD1]], [[TMP9]] -; FIXED-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP13:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP10]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI4:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP11]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT3]], zeroinitializer +; FIXED-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> +; FIXED-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[BROADCAST_SPLAT3]], <4 x i64> +; FIXED-NEXT: [[TMP10:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[TMP8]] +; FIXED-NEXT: [[TMP11:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP9]] +; FIXED-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP10]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP11]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI4]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: @@ -695,28 +713,31 @@ ; CHECK-LABEL: @predicated_udiv_by_constant( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP6:%.*]] = udiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP6]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = udiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP8]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -749,24 +770,24 @@ ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP8:%.*]] = udiv <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP9:%.*]] = udiv <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP11:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP9]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP9:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP9]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: @@ -820,28 +841,31 @@ ; CHECK-LABEL: @predicated_sdiv_by_constant( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP6:%.*]] = sdiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP6]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = sdiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP8]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -874,24 +898,24 @@ ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP8:%.*]] = sdiv <2 x i64> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP9:%.*]] = sdiv <2 x i64> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP11:%.*]] = xor <2 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP9]], <2 x i64> [[WIDE_LOAD1]] -; FIXED-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; FIXED-NEXT: store <2 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP9:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP7]], <4 x i64> [[TMP9]], <4 x i64> [[WIDE_LOAD1]] +; FIXED-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP5]], align 8 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; FIXED: middle.block: @@ -945,12 +969,12 @@ ; CHECK-LABEL: @predicated_sdiv_by_minus_one( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -959,15 +983,15 @@ ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 -128, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]] -; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 -128, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]] +; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -1003,26 +1027,26 @@ ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 32 ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 16 -; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <16 x i8> [[WIDE_LOAD]], -; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <16 x i8> [[WIDE_LOAD1]], -; FIXED-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> , <16 x i8> -; FIXED-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP7]], <16 x i8> , <16 x i8> -; FIXED-NEXT: [[TMP10:%.*]] = sdiv <16 x i8> [[WIDE_LOAD]], [[TMP8]] -; FIXED-NEXT: [[TMP11:%.*]] = sdiv <16 x i8> [[WIDE_LOAD1]], [[TMP9]] -; FIXED-NEXT: [[TMP12:%.*]] = xor <16 x i1> [[TMP6]], -; FIXED-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[TMP7]], -; FIXED-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP10]], <16 x i8> [[WIDE_LOAD]] -; FIXED-NEXT: [[PREDPHI2:%.*]] = select <16 x i1> [[TMP7]], <16 x i8> [[TMP11]], <16 x i8> [[WIDE_LOAD1]] -; FIXED-NEXT: store <16 x i8> [[PREDPHI]], ptr [[TMP4]], align 1 -; FIXED-NEXT: store <16 x i8> [[PREDPHI2]], ptr [[TMP5]], align 1 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1 +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 32 +; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1 +; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD]], +; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD1]], +; FIXED-NEXT: [[TMP8:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> , <32 x i8> +; FIXED-NEXT: [[TMP9:%.*]] = select <32 x i1> [[TMP7]], <32 x i8> , <32 x i8> +; FIXED-NEXT: [[TMP10:%.*]] = sdiv <32 x i8> [[WIDE_LOAD]], [[TMP8]] +; FIXED-NEXT: [[TMP11:%.*]] = sdiv <32 x i8> [[WIDE_LOAD1]], [[TMP9]] +; FIXED-NEXT: [[TMP12:%.*]] = xor <32 x i1> [[TMP6]], +; FIXED-NEXT: [[TMP13:%.*]] = xor <32 x i1> [[TMP7]], +; FIXED-NEXT: [[PREDPHI:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP10]], <32 x i8> [[WIDE_LOAD]] +; FIXED-NEXT: [[PREDPHI2:%.*]] = select <32 x i1> [[TMP7]], <32 x i8> [[TMP11]], <32 x i8> [[WIDE_LOAD1]] +; FIXED-NEXT: store <32 x i8> [[PREDPHI]], ptr [[TMP4]], align 1 +; FIXED-NEXT: store <32 x i8> [[PREDPHI2]], ptr [[TMP5]], align 1 +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; FIXED: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll @@ -102,31 +102,31 @@ ; CHECK-LABEL: @uniform_store_i1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 64 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], 8 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x ptr> poison, ptr [[START]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT]], <16 x ptr> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x ptr> poison, ptr [[START]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT2]], <16 x ptr> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x ptr> poison, ptr [[START]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x ptr> [[BROADCAST_SPLATINSERT]], <32 x ptr> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <32 x ptr> poison, ptr [[START]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <32 x ptr> [[BROADCAST_SPLATINSERT2]], <32 x ptr> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <16 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <16 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, <16 x ptr> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <16 x ptr> [[TMP3]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <16 x ptr> [[TMP4]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <16 x ptr> [[TMP5]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP7]], i32 15 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <32 x ptr> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <32 x ptr> [[TMP5]], [[BROADCAST_SPLAT3]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i1> [[TMP7]], i32 31 ; CHECK-NEXT: store i1 [[TMP8]], ptr [[DST:%.*]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 256 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 512 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -13,31 +13,31 @@ ; OUTLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; OUTLOOP: for.body.preheader: ; OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2 +; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4 ; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]] ; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; OUTLOOP: vector.ph: ; OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2 +; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4 ; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]] ; OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; OUTLOOP: vector.body: ; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; OUTLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 ; OUTLOOP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]] ; OUTLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 -; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 2 -; OUTLOOP-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD]] to -; OUTLOOP-NEXT: [[TMP8]] = add [[VEC_PHI]], [[TMP7]] +; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 2 +; OUTLOOP-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD]] to +; OUTLOOP-NEXT: [[TMP8]] = add [[VEC_PHI]], [[TMP7]] ; OUTLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() -; OUTLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 2 +; OUTLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 ; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP10]] ; OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; OUTLOOP: middle.block: -; OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[TMP8]]) +; OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP8]]) ; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; OUTLOOP: scalar.ph: @@ -67,12 +67,12 @@ ; INLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; INLOOP: for.body.preheader: ; INLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4 +; INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 8 ; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]] ; INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INLOOP: vector.ph: ; INLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4 +; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 8 ; INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]] ; INLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] @@ -82,12 +82,12 @@ ; INLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 ; INLOOP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]] ; INLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 -; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 2 -; INLOOP-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD]] to -; INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP7]]) +; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 2 +; INLOOP-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD]] to +; INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP7]]) ; INLOOP-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]] ; INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4 +; INLOOP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 8 ; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP11]] ; INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -5,41 +5,41 @@ ; CHECK-LABEL: @load_store_factor2_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = shl [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP10]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP12]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], [[TMP13]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP15]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = shl [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP10]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP12]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], [[TMP13]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP15]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -140,46 +140,46 @@ ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = mul [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP10]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP12]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], [[TMP13]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP15]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P]], [[TMP16]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP18]], [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = mul [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP10]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP12]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P]], [[TMP13]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_GATHER1]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP15]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[P]], [[TMP16]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP18]], [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -414,39 +414,39 @@ ; CHECK-LABEL: @combine_load_factor2_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = shl [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP10]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[P]], [[TMP12]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP14:%.*]] = add [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER1]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP14]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = shl [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP10]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[P]], [[TMP12]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP14:%.*]] = add [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER1]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP14]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=LMUL1 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=1 | FileCheck %s -check-prefix=LMUL1 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=2 | FileCheck %s -check-prefix=LMUL2 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=4 | FileCheck %s -check-prefix=LMUL4 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S --riscv-v-register-bit-width-lmul=8 | FileCheck %s -check-prefix=LMUL8 +; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=LMUL2 define void @load_store(ptr %p) { ; LMUL1-LABEL: @load_store( diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -9,9 +9,9 @@ ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] @@ -20,18 +20,18 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP5]], i64 5) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP5]], i64 5) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8i8.p0(ptr [[TMP7]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP8:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP7]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv8i8.p0(ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP8]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP11]], ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP8]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP11]], ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]] ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll @@ -12,44 +12,44 @@ ; VLENUNK-LABEL: @test( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: ; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; VLENUNK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer -; VLENUNK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; VLENUNK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; VLENUNK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; VLENUNK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; VLENUNK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] ; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] -; VLENUNK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 -; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VLENUNK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult [[VEC_IND]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult [[VEC_IND]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer) ; VLENUNK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] ; VLENUNK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 -; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison) -; VLENUNK-NEXT: [[TMP14:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], zeroinitializer, [[WIDE_MASKED_LOAD]] -; VLENUNK-NEXT: [[TMP15:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]] +; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison) +; VLENUNK-NEXT: [[TMP14:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], zeroinitializer, [[WIDE_MASKED_LOAD]] +; VLENUNK-NEXT: [[TMP15:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]] ; VLENUNK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP10]] ; VLENUNK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 -; VLENUNK-NEXT: store [[TMP15]], ptr [[TMP17]], align 4 +; VLENUNK-NEXT: store [[TMP15]], ptr [[TMP17]], align 4 ; VLENUNK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; VLENUNK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] -; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; VLENUNK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VLENUNK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -17,8 +17,9 @@ ; RV32-LABEL: @foo4( ; RV32-NEXT: entry: ; RV32-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]]) -; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]] +; RV32-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) +; RV32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]] ; RV32-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; RV32: vector.memcheck: ; RV32-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 @@ -33,37 +34,40 @@ ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV32: vector.ph: -; RV32-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP2]] +; RV32-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; RV32-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]] ; RV32-NEXT: [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]] ; RV32-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 16 -; RV32-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; RV32-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; RV32-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; RV32-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP7:%.*]] = mul i64 16, [[TMP6]] -; RV32-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; RV32-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; RV32-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; RV32-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; RV32-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; RV32-NEXT: [[TMP10:%.*]] = mul i64 16, [[TMP9]] +; RV32-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; RV32-NEXT: br label [[VECTOR_BODY:%.*]] ; RV32: vector.body: ; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV32-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV32-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i32.nxv1p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 -; RV32-NEXT: [[TMP9:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[TMP10:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP10]] -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv1f64.nxv1p0( [[TMP11]], i32 8, [[TMP9]], poison), !alias.scope !3 -; RV32-NEXT: [[TMP12:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to -; RV32-NEXT: [[TMP13:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP12]] -; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] -; RV32-NEXT: call void @llvm.masked.scatter.nxv1f64.nxv1p0( [[TMP13]], [[TMP14]], i32 8, [[TMP9]]), !alias.scope !5, !noalias !7 -; RV32-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] -; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; RV32-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV32-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV32-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 +; RV32-NEXT: [[TMP12:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[TMP13:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; RV32-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP13]] +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP14]], i32 8, [[TMP12]], poison), !alias.scope !3 +; RV32-NEXT: [[TMP15:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to +; RV32-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP15]] +; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] +; RV32-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP16]], [[TMP17]], i32 8, [[TMP12]]), !alias.scope !5, !noalias !7 +; RV32-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; RV32-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV32-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; RV32: middle.block: ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -73,30 +77,31 @@ ; RV32: for.body: ; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; RV32-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100 +; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100 ; RV32-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; RV32: if.then: -; RV32-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP18]] -; RV32-NEXT: [[TMP19:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 -; RV32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP17]] to double -; RV32-NEXT: [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]] +; RV32-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; RV32-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP22]] +; RV32-NEXT: [[TMP23:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; RV32-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP21]] to double +; RV32-NEXT: [[ADD:%.*]] = fadd double [[TMP23]], [[CONV]] ; RV32-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] ; RV32-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 ; RV32-NEXT: br label [[FOR_INC]] ; RV32: for.inc: ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] +; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] ; RV32: for.end: ; RV32-NEXT: ret void ; ; RV64-LABEL: @foo4( ; RV64-NEXT: entry: ; RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP0]]) -; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP1]] +; RV64-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 16, i64 [[TMP1]]) +; RV64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]] ; RV64-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; RV64: vector.memcheck: ; RV64-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880 @@ -111,37 +116,40 @@ ; RV64-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] ; RV64-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV64: vector.ph: -; RV64-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP2]] +; RV64-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 +; RV64-NEXT: [[N_MOD_VF:%.*]] = urem i64 625, [[TMP4]] ; RV64-NEXT: [[N_VEC:%.*]] = sub i64 625, [[N_MOD_VF]] ; RV64-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 16 -; RV64-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; RV64-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; RV64-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) -; RV64-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; RV64-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP7:%.*]] = mul i64 16, [[TMP6]] -; RV64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; RV64-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; RV64-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; RV64-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 16, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; RV64-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; RV64-NEXT: [[TMP10:%.*]] = mul i64 16, [[TMP9]] +; RV64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; RV64-NEXT: br label [[VECTOR_BODY:%.*]] ; RV64: vector.body: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV64-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; RV64-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] -; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i32.nxv1p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 -; RV64-NEXT: [[TMP9:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) -; RV64-NEXT: [[TMP10:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP10]] -; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv1f64.nxv1p0( [[TMP11]], i32 8, [[TMP9]], poison), !alias.scope !3 -; RV64-NEXT: [[TMP12:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to -; RV64-NEXT: [[TMP13:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP12]] -; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] -; RV64-NEXT: call void @llvm.masked.scatter.nxv1f64.nxv1p0( [[TMP13]], [[TMP14]], i32 8, [[TMP9]]), !alias.scope !5, !noalias !7 -; RV64-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] -; RV64-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; RV64-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; RV64-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV64-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; RV64-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], [[VEC_IND]] +; RV64-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i32.nxv2p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !0 +; RV64-NEXT: [[TMP12:%.*]] = icmp slt [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[TMP13:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; RV64-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], [[TMP13]] +; RV64-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP14]], i32 8, [[TMP12]], poison), !alias.scope !3 +; RV64-NEXT: [[TMP15:%.*]] = sitofp [[WIDE_MASKED_GATHER]] to +; RV64-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP15]] +; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] +; RV64-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP16]], [[TMP17]], i32 8, [[TMP12]]), !alias.scope !5, !noalias !7 +; RV64-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; RV64-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; RV64-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; RV64: middle.block: ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -151,22 +159,22 @@ ; RV64: for.body: ; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; RV64-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100 +; RV64-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; RV64-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100 ; RV64-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; RV64: if.then: -; RV64-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 -; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP18]] -; RV64-NEXT: [[TMP19:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 -; RV64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP17]] to double -; RV64-NEXT: [[ADD:%.*]] = fadd double [[TMP19]], [[CONV]] +; RV64-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; RV64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP22]] +; RV64-NEXT: [[TMP23:%.*]] = load double, ptr [[ARRAYIDX3]], align 8 +; RV64-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP21]] to double +; RV64-NEXT: [[ADD:%.*]] = fadd double [[TMP23]], [[CONV]] ; RV64-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] ; RV64-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8 ; RV64-NEXT: br label [[FOR_INC]] ; RV64: for.inc: ; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] +; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] ; RV64: for.end: ; RV64-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-interleaved.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; REQUIRES: asserts ; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple riscv64-linux-gnu \ ; RUN: -mattr=+v -debug-only=loop-vectorize \ @@ -10,6 +11,49 @@ ; Function Attrs: nofree norecurse nosync nounwind writeonly define dso_local void @foo(i32 signext %n, ptr nocapture %A) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @foo +; CHECK-SAME: (i32 signext [[N:%.*]], ptr nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: store <8 x i32> [[VEC_IND]], ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 8 +; CHECK-NEXT: store <8 x i32> [[STEP_ADD]], ptr [[TMP1]], align 4, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: store i32 [[TMP3]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; entry: %cmp5 = icmp sgt i32 %n, 0 br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll @@ -14,10 +14,10 @@ ; LMUL1-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], -1 ; LMUL1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; LMUL1-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; LMUL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; LMUL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8 ; LMUL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; LMUL1: vector.ph: -; LMUL1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; LMUL1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8 ; LMUL1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; LMUL1-NEXT: br label [[VECTOR_BODY:%.*]] ; LMUL1: vector.body: @@ -25,15 +25,15 @@ ; LMUL1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 ; LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] ; LMUL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 -; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4 +; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 ; LMUL1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]] ; LMUL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 -; LMUL1-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4 +; LMUL1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] ; LMUL1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]] ; LMUL1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 -; LMUL1-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP10]], align 4 -; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; LMUL1-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP10]], align 4 +; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; LMUL1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; LMUL1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; LMUL1: middle.block: @@ -54,7 +54,7 @@ ; LMUL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; LMUL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; LMUL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]] -; LMUL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; LMUL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; LMUL1: for.end.loopexit: ; LMUL1-NEXT: br label [[FOR_END]] ; LMUL1: for.end: @@ -108,7 +108,7 @@ ; LMUL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; LMUL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; LMUL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]] -; LMUL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; LMUL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; LMUL2: for.end.loopexit: ; LMUL2-NEXT: br label [[FOR_END]] ; LMUL2: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll @@ -11,27 +11,30 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 32 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 200 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP7]], align 32 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 200 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -47,7 +50,7 @@ ; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -82,12 +85,12 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 32 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 100 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP5]], align 32 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -132,27 +135,30 @@ ; CHECK-LABEL: @trivial_due_max_vscale( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 32 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 8192 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP7]], align 32 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 8192 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -195,27 +201,30 @@ ; CHECK-LABEL: @no_high_lmul_or_interleave( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 200, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 32 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 1024 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP7]], align 32 -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 1024 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 200, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll @@ -16,27 +16,30 @@ ; VLENUNK-LABEL: @vector_add( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: -; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP5:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; VLENUNK-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; VLENUNK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; VLENUNK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VLENUNK: middle.block: ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -58,27 +61,30 @@ ; VLEN128-LABEL: @vector_add( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: -; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP5:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; VLEN128-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; VLEN128-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLEN128-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; VLEN128-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VLEN128: middle.block: ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -121,27 +127,27 @@ ; VLENUNK-LABEL: @vector_add_i32( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: ; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]] ; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 -; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; VLENUNK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; VLENUNK-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; VLENUNK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; VLENUNK-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 ; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -166,27 +172,27 @@ ; VLEN128-LABEL: @vector_add_i32( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: ; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 -; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 +; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]] ; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 -; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; VLEN128-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; VLEN128-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; VLEN128-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; VLEN128-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 ; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -287,27 +293,30 @@ ; VLENUNK-LABEL: @indexed_store( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: -; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]] -; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] -; VLENUNK-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[TMP5]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; VLENUNK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; VLENUNK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]] +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] +; VLENUNK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VLENUNK: middle.block: ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -329,27 +338,30 @@ ; VLEN128-LABEL: @indexed_store( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: -; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]] -; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] -; VLEN128-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[TMP5]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; VLEN128-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; VLEN128-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLEN128-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]] +; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] +; VLEN128-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VLEN128: middle.block: ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -389,34 +401,37 @@ ; VLENUNK-LABEL: @indexed_load( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: -; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]] -; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] -; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[TMP5]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; VLENUNK-NEXT: [[TMP6]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] -; VLENUNK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VLENUNK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]] +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] +; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; VLENUNK-NEXT: [[TMP8]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; VLENUNK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLENUNK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VLENUNK: middle.block: -; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP6]]) +; VLENUNK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP8]]) ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VLENUNK: scalar.ph: ; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; VLENUNK-NEXT: br label [[FOR_BODY:%.*]] ; VLENUNK: for.body: ; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -430,40 +445,43 @@ ; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VLENUNK: for.end: -; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; VLENUNK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; ; VLEN128-LABEL: @indexed_load( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: -; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP2]] -; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] -; VLEN128-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[TMP5]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; VLEN128-NEXT: [[TMP6]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] -; VLEN128-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLEN128-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VLEN128-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]] +; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] +; VLEN128-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; VLEN128-NEXT: [[TMP8]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; VLEN128-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; VLEN128-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLEN128-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VLEN128: middle.block: -; VLEN128-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP6]]) +; VLEN128-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP8]]) ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VLEN128: scalar.ph: ; VLEN128-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLEN128-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; VLEN128-NEXT: br label [[FOR_BODY:%.*]] ; VLEN128: for.body: ; VLEN128-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -477,7 +495,7 @@ ; VLEN128-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; VLEN128-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VLEN128: for.end: -; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; VLEN128-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; VLEN128-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: @@ -503,25 +521,28 @@ ; VLENUNK-LABEL: @splat_int( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: -; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLENUNK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; VLENUNK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLENUNK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; VLENUNK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLENUNK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VLENUNK: middle.block: ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -541,25 +562,28 @@ ; VLEN128-LABEL: @splat_int( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: -; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; VLEN128-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; VLEN128-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLEN128-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; VLEN128-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; VLEN128-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLEN128-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VLEN128: middle.block: ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -595,25 +619,28 @@ ; VLENUNK-LABEL: @splat_ptr( ; VLENUNK-NEXT: entry: ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLENUNK: vector.ph: -; VLENUNK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[V:%.*]], i64 0 -; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[V:%.*]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0 -; VLENUNK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; VLENUNK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; VLENUNK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0 +; VLENUNK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; VLENUNK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLENUNK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VLENUNK: middle.block: ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -633,25 +660,28 @@ ; VLEN128-LABEL: @splat_ptr( ; VLEN128-NEXT: entry: ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VLEN128: vector.ph: -; VLEN128-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[V:%.*]], i64 0 -; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[V:%.*]], i64 0 +; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLEN128-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; VLEN128-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; VLEN128-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0 -; VLEN128-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; VLEN128-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; VLEN128-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLEN128-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0 +; VLEN128-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; VLEN128-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VLEN128-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VLEN128: middle.block: ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on \ ; RUN: -riscv-v-vector-bits-min=128 -riscv-v-vector-bits-max=128 \ ; RUN: -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize \ @@ -12,15 +13,66 @@ ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2) define i32 @add(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) { -; CHECK-LABEL: @add -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load -; CHECK: %[[LOAD2:.*]] = load -; CHECK: %[[ADD1:.*]] = add %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add %[[LOAD2]] -; CHECK: middle.block: -; CHECK: %[[ADD:.*]] = add %[[ADD2]], %[[ADD1]] -; CHECK-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32( %[[ADD]]) +; CHECK-LABEL: define i32 @add +; CHECK-SAME: (ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16]] = add [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17]] = add [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP22]], [[SUM_07]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[ADD_LCSSA]] +; entry: br label %for.body @@ -42,15 +94,66 @@ ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2) define i32 @or(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) { -; CHECK-LABEL: @or -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load -; CHECK: %[[LOAD2:.*]] = load -; CHECK: %[[OR1:.*]] = or %[[LOAD1]] -; CHECK: %[[OR2:.*]] = or %[[LOAD2]] -; CHECK: middle.block: -; CHECK: %[[OR:.*]] = or %[[OR2]], %[[OR1]] -; CHECK-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32( %[[OR]]) +; CHECK-LABEL: define i32 @or +; CHECK-SAME: (ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16]] = or [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17]] = or [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = or [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32( [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[OR]] = or i32 [[TMP22]], [[SUM_07]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[OR_LCSSA]] +; entry: br label %for.body @@ -72,15 +175,66 @@ ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2) define i32 @and(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) { -; CHECK-LABEL: @and -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load -; CHECK: %[[LOAD2:.*]] = load -; CHECK: %[[AND1:.*]] = and %[[LOAD1]] -; CHECK: %[[AND2:.*]] = and %[[LOAD2]] -; CHECK: middle.block: -; CHECK: %[[ABD:.*]] = and %[[ADD2]], %[[AND1]] -; CHECK-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32( %[[ADD]]) +; CHECK-LABEL: define i32 @and +; CHECK-SAME: (ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, i32 -1, i64 0), poison, zeroinitializer), i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 -1, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16]] = and [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17]] = and [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = and [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32( [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[AND]] = and i32 [[TMP22]], [[SUM_07]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[AND_LCSSA]] +; entry: br label %for.body @@ -102,15 +256,66 @@ ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2) define i32 @xor(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) { -; CHECK-LABEL: @xor -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load -; CHECK: %[[LOAD2:.*]] = load -; CHECK: %[[XOR1:.*]] = xor %[[LOAD1]] -; CHECK: %[[XOR2:.*]] = xor %[[LOAD2]] -; CHECK: middle.block: -; CHECK: %[[XOR:.*]] = xor %[[XOR2]], %[[XOR1]] -; CHECK-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32( %[[XOR]]) +; CHECK-LABEL: define i32 @xor +; CHECK-SAME: (ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16]] = xor [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17]] = xor [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = xor [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32( [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[XOR]] = xor i32 [[TMP22]], [[SUM_07]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[XOR_LCSSA]] +; entry: br label %for.body @@ -132,18 +337,70 @@ ; SMIN define i32 @smin(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) { -; CHECK-LABEL: @smin -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load -; CHECK: %[[LOAD2:.*]] = load -; CHECK: %[[ICMP1:.*]] = icmp slt %[[LOAD1]] -; CHECK: %[[ICMP2:.*]] = icmp slt %[[LOAD2]] -; CHECK: %[[SEL1:.*]] = select %[[ICMP1]], %[[LOAD1]] -; CHECK: %[[SEL2:.*]] = select %[[ICMP2]], %[[LOAD2]] -; CHECK: middle.block: -; CHECK: %[[ICMP:.*]] = icmp slt %[[SEL1]], %[[SEL2]] -; CHECK-NEXT: %[[SEL:.*]] = select %[[ICMP]], %[[SEL1]], %[[SEL2]] -; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32( %[[SEL]]) +; CHECK-LABEL: define i32 @smin +; CHECK-SAME: (ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp slt [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp slt [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP18]] = select [[TMP16]], [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP19]] = select [[TMP17]], [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select [[RDX_MINMAX_CMP]], [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32( [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP24]], [[SUM_010]] +; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP24]], i32 [[SUM_010]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[DOTSROA_SPECULATED_LCSSA]] +; entry: br label %for.body @@ -166,18 +423,70 @@ ; UMAX define i32 @umax(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) { -; CHECK-LABEL: @umax -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load -; CHECK: %[[LOAD2:.*]] = load -; CHECK: %[[ICMP1:.*]] = icmp ugt %[[LOAD1]] -; CHECK: %[[ICMP2:.*]] = icmp ugt %[[LOAD2]] -; CHECK: %[[SEL1:.*]] = select %[[ICMP1]], %[[LOAD1]] -; CHECK: %[[SEL2:.*]] = select %[[ICMP2]], %[[LOAD2]] -; CHECK: middle.block: -; CHECK: %[[ICMP:.*]] = icmp ugt %[[SEL1]], %[[SEL2]] -; CHECK-NEXT: %[[SEL:.*]] = select %[[ICMP]], %[[SEL1]], %[[SEL2]] -; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32( %[[SEL]]) +; CHECK-LABEL: define i32 @umax +; CHECK-SAME: (ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ugt [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ugt [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP18]] = select [[TMP16]], [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP19]] = select [[TMP17]], [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select [[RDX_MINMAX_CMP]], [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32( [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP24]], [[SUM_010]] +; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP24]], i32 [[SUM_010]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[DOTSROA_SPECULATED_LCSSA]] +; entry: br label %for.body @@ -200,15 +509,66 @@ ; FADD (FAST) define float @fadd_fast(ptr noalias nocapture readonly %a, i64 %n) { -; CHECK-LABEL: @fadd_fast -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load -; CHECK: %[[LOAD2:.*]] = load -; CHECK: %[[ADD1:.*]] = fadd fast %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = fadd fast %[[LOAD2]] -; CHECK: middle.block: -; CHECK: %[[ADD:.*]] = fadd fast %[[ADD2]], %[[ADD1]] -; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, %[[ADD]]) +; CHECK-LABEL: define float @fadd_fast +; CHECK-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16]] = fadd fast [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17]] = fadd fast [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = fadd fast float [[TMP22]], [[SUM_07]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[ADD_LCSSA]] +; entry: br label %for.body @@ -229,15 +589,54 @@ ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. ; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2) define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) { -; CHECK-LABEL: @fadd_fast_bfloat -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <8 x bfloat> -; CHECK: %[[LOAD2:.*]] = load <8 x bfloat> -; CHECK: %[[FADD1:.*]] = fadd fast <8 x bfloat> %[[LOAD1]] -; CHECK: %[[FADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]] -; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]] -; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]]) +; CHECK-LABEL: define bfloat @fadd_fast_bfloat +; CHECK-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x bfloat> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x bfloat> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds bfloat, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds bfloat, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds bfloat, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x bfloat>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds bfloat, ptr [[TMP2]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x bfloat>, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6]] = fadd fast <16 x bfloat> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP7]] = fadd fast <16 x bfloat> [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x bfloat> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR8000, <16 x bfloat> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi bfloat [ 0xR0000, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi bfloat [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds bfloat, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load bfloat, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD]] = fadd fast bfloat [[TMP10]], [[SUM_07]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi bfloat [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret bfloat [[ADD_LCSSA]] +; entry: br label %for.body @@ -259,18 +658,70 @@ ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2) define float @fmin_fast(ptr noalias nocapture readonly %a, i64 %n) #0 { -; CHECK-LABEL: @fmin_fast -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load -; CHECK: %[[LOAD2:.*]] = load -; CHECK: %[[FCMP1:.*]] = fcmp olt %[[LOAD1]] -; CHECK: %[[FCMP2:.*]] = fcmp olt %[[LOAD2]] -; CHECK: %[[SEL1:.*]] = select %[[FCMP1]], %[[LOAD1]] -; CHECK: %[[SEL2:.*]] = select %[[FCMP2]], %[[LOAD2]] -; CHECK: middle.block: -; CHECK: %[[FCMP:.*]] = fcmp olt %[[SEL1]], %[[SEL2]] -; CHECK-NEXT: %[[SEL:.*]] = select %[[FCMP]], %[[SEL1]], %[[SEL2]] -; CHECK-NEXT: call float @llvm.vector.reduce.fmin.nxv8f32( %[[SEL]]) +; CHECK-LABEL: define float @fmin_fast +; CHECK-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = fcmp olt [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17:%.*]] = fcmp olt [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP18]] = select [[TMP16]], [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP19]] = select [[TMP17]], [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp olt [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select [[RDX_MINMAX_CMP]], [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = call float @llvm.vector.reduce.fmin.nxv8f32( [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt float [[TMP24]], [[SUM_07]] +; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP24]], float [[SUM_07]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[DOTSROA_SPECULATED_LCSSA]] +; entry: br label %for.body @@ -293,18 +744,70 @@ ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2) define float @fmax_fast(ptr noalias nocapture readonly %a, i64 %n) #0 { -; CHECK-LABEL: @fmax_fast -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load -; CHECK: %[[LOAD2:.*]] = load -; CHECK: %[[FCMP1:.*]] = fcmp fast ogt %[[LOAD1]] -; CHECK: %[[FCMP2:.*]] = fcmp fast ogt %[[LOAD2]] -; CHECK: %[[SEL1:.*]] = select %[[FCMP1]], %[[LOAD1]] -; CHECK: %[[SEL2:.*]] = select %[[FCMP2]], %[[LOAD2]] -; CHECK: middle.block: -; CHECK: %[[FCMP:.*]] = fcmp fast ogt %[[SEL1]], %[[SEL2]] -; CHECK-NEXT: %[[SEL:.*]] = select fast %[[FCMP]], %[[SEL1]], %[[SEL2]] -; CHECK-NEXT: call fast float @llvm.vector.reduce.fmax.nxv8f32( %[[SEL]]) +; CHECK-LABEL: define float @fmax_fast +; CHECK-SAME: (ptr noalias nocapture readonly [[A:%.*]], i64 [[N:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast ogt [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17:%.*]] = fcmp fast ogt [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP18]] = select [[TMP16]], [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP19]] = select [[TMP17]], [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast [[RDX_MINMAX_CMP]], [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv8f32( [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast ogt float [[TMP24]], [[SUM_07]] +; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP24]], float [[SUM_07]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[DOTSROA_SPECULATED_LCSSA]] +; entry: br label %for.body @@ -330,15 +833,54 @@ ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. ; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) define i32 @mul(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) { -; CHECK-LABEL: @mul -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <4 x i32> -; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]] -; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK-LABEL: define i32 @mul +; CHECK-SAME: (ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 8 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6]] = mul <8 x i32> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP7]] = mul <8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <8 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL]] = mul nsw i32 [[TMP10]], [[SUM_07]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[MUL_LCSSA]] +; entry: br label %for.body @@ -360,19 +902,76 @@ ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. ; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2) define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) { -; CHECK-LABEL: @memory_dependence -; CHECK: vector.body: -; CHECK: %[[LOAD1:.*]] = load <4 x i32> -; CHECK: %[[LOAD2:.*]] = load <4 x i32> -; CHECK: %[[LOAD3:.*]] = load <4 x i32> -; CHECK: %[[LOAD4:.*]] = load <4 x i32> -; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]] -; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]] -; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]] -; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]] -; CHECK: middle.block: -; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]] -; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]]) +; CHECK-LABEL: define i32 @memory_dependence +; CHECK-SAME: (ptr noalias nocapture [[A:%.*]], ptr noalias nocapture readonly [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 8 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i32>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 8 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP0]], 32 +; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[TMP1]], 32 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 +; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 8 +; CHECK-NEXT: store <8 x i32> [[TMP11]], ptr [[TMP17]], align 4 +; CHECK-NEXT: [[TMP18]] = mul <8 x i32> [[WIDE_LOAD3]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP19]] = mul <8 x i32> [[WIDE_LOAD4]], [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <8 x i32> [[TMP19]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ [[MUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i64 [[I]], 32 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[ADD2]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[MUL]] = mul nsw i32 [[TMP23]], [[SUM]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[MUL_LCSSA]] +; entry: br label %for.body @@ -398,17 +997,75 @@ ; CHECK-REMARK: vectorized loop (vectorization width: vscale x 2, interleaved count: 2) define float @fmuladd(ptr %a, ptr %b, i64 %n) { -; CHECK-LABEL: @fmuladd( -; CHECK: vector.body: -; CHECK: [[WIDE_LOAD:%.*]] = load -; CHECK: [[WIDE_LOAD2:%.*]] = load -; CHECK: [[WIDE_LOAD3:%.*]] = load -; CHECK: [[WIDE_LOAD4:%.*]] = load -; CHECK: [[MULADD1:%.*]] = call reassoc @llvm.fmuladd.nxv2f32( [[WIDE_LOAD]], [[WIDE_LOAD3]], -; CHECK: [[MULADD2:%.*]] = call reassoc @llvm.fmuladd.nxv2f32( [[WIDE_LOAD2]], [[WIDE_LOAD4]], -; CHECK: middle.block: -; CHECK: [[BIN_RDX:%.*]] = fadd reassoc [[MULADD2]], [[MULADD1]] -; CHECK: call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float -0.000000e+00, [[BIN_RDX]]) +; CHECK-LABEL: define float @fmuladd +; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP18]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP20]] +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP21]], align 4 +; CHECK-NEXT: [[TMP22]] = call reassoc @llvm.fmuladd.nxv4f32( [[WIDE_LOAD]], [[WIDE_LOAD3]], [[VEC_PHI]]) +; CHECK-NEXT: [[TMP23]] = call reassoc @llvm.fmuladd.nxv4f32( [[WIDE_LOAD2]], [[WIDE_LOAD4]], [[VEC_PHI1]]) +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP27:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP28:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP28]], float [[TMP29]], float [[SUM_07]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[MULADD_LCSSA]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll @@ -12,27 +12,30 @@ ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP6:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[TMP6]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -75,27 +78,30 @@ ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -136,34 +142,37 @@ ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv1i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP7]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP9]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64( [[TMP8]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP10]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -177,7 +186,7 @@ ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.end: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: @@ -205,25 +214,28 @@ ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -260,26 +272,29 @@ ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) ; CHECK-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll @@ -57,33 +57,22 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP4]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[TMP5]], i32 4) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv2i32.p0( [[TMP8]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -13,26 +13,29 @@ ; SCALABLE-LABEL: @uniform_load( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP3:%.*]] = load i64, ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 -; SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B:%.*]], align 8 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8 +; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -58,17 +61,17 @@ ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 8 -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXEDLEN: middle.block: @@ -93,26 +96,29 @@ ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = load i64, ptr [[B:%.*]], align 8 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B:%.*]], align 8 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -138,12 +144,12 @@ ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = load i64, ptr [[B:%.*]], align 8 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -183,26 +189,29 @@ ; SCALABLE-LABEL: @uniform_load_outside_use( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP3:%.*]] = load i64, ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 -; SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B:%.*]], align 8 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8 +; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -218,7 +227,7 @@ ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: for.end: -; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; SCALABLE-NEXT: ret i64 [[V_LCSSA]] ; ; FIXEDLEN-LABEL: @uniform_load_outside_use( @@ -229,17 +238,17 @@ ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 8 -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXEDLEN: middle.block: @@ -284,12 +293,12 @@ ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = load i64, ptr [[B:%.*]], align 8 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -331,39 +340,43 @@ ; SCALABLE-LABEL: @conditional_uniform_load( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; SCALABLE-NEXT: [[TMP3:%.*]] = add [[TMP2]], zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = mul [[TMP3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP4]] -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP5]] -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 -; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; SCALABLE-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], i32 8, [[TMP8]], poison) -; SCALABLE-NEXT: [[TMP9:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[WIDE_MASKED_GATHER]], zeroinitializer -; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; SCALABLE-NEXT: store [[PREDPHI]], ptr [[TMP11]], align 8 -; SCALABLE-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], i32 8, [[TMP11]], poison) +; SCALABLE-NEXT: [[TMP12:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP11]], [[WIDE_MASKED_GATHER]], zeroinitializer +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]] +; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0 +; SCALABLE-NEXT: store [[PREDPHI]], ptr [[TMP14]], align 8 +; SCALABLE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -391,33 +404,33 @@ ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[B]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT2]], <4 x ptr> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i64> [[STEP_ADD]], -; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> [[TMP2]], <2 x i64> poison) -; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[BROADCAST_SPLAT3]], i32 8, <2 x i1> [[TMP3]], <2 x i64> poison) -; FIXEDLEN-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP2]], -; FIXEDLEN-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP3]], -; FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[WIDE_MASKED_GATHER]], <2 x i64> zeroinitializer -; FIXEDLEN-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[WIDE_MASKED_GATHER4]], <2 x i64> zeroinitializer +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison) +; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT3]], i32 8, <4 x i1> [[TMP3]], <4 x i64> poison) +; FIXEDLEN-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP2]], +; FIXEDLEN-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP3]], +; FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer +; FIXEDLEN-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[WIDE_MASKED_GATHER4]], <4 x i64> zeroinitializer ; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP8]], align 8 -; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[PREDPHI5]], ptr [[TMP9]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 8 +; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI5]], ptr [[TMP9]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], ; FIXEDLEN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXEDLEN: middle.block: @@ -448,42 +461,46 @@ ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]] -; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP9]] +; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP8]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], i32 8, [[TMP10]], poison) -; TF-SCALABLE-NEXT: [[TMP11:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer -; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], [[WIDE_MASKED_GATHER]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] -; TF-SCALABLE-NEXT: [[TMP14:%.*]] = or [[TMP10]], [[TMP12]] -; TF-SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[PREDPHI]], ptr [[TMP15]], i32 8, [[TMP14]]) -; TF-SCALABLE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] -; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP11]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer +; TF-SCALABLE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], i32 8, [[TMP13]], poison) +; TF-SCALABLE-NEXT: [[TMP14:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer +; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], [[WIDE_MASKED_GATHER]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]] +; TF-SCALABLE-NEXT: [[TMP17:%.*]] = or [[TMP13]], [[TMP15]] +; TF-SCALABLE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP18]], i32 8, [[TMP17]]) +; TF-SCALABLE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]] +; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; TF-SCALABLE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -510,22 +527,22 @@ ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; TF-FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> [[TMP1]], <2 x i64> poison) -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = xor <2 x i1> [[TMP1]], -; TF-FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i64> [[WIDE_MASKED_GATHER]], <2 x i64> zeroinitializer +; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP1]], <4 x i64> poison) +; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], +; TF-FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP4]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -578,26 +595,29 @@ ; SCALABLE-LABEL: @uniform_load_unaligned( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP3:%.*]] = load i64, ptr [[B:%.*]], align 1 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 -; SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP5:%.*]] = load i64, ptr [[B:%.*]], align 1 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8 +; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -623,17 +643,17 @@ ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 1 -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXEDLEN: middle.block: @@ -658,26 +678,29 @@ ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = load i64, ptr [[B:%.*]], align 1 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP6]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B:%.*]], align 1 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -703,12 +726,12 @@ ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = load i64, ptr [[B:%.*]], align 1 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -748,26 +771,29 @@ ; SCALABLE-LABEL: @uniform_store( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -789,23 +815,23 @@ ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 +; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXEDLEN: middle.block: @@ -830,26 +856,29 @@ ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -870,8 +899,8 @@ ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -879,8 +908,8 @@ ; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -920,35 +949,40 @@ ; SCALABLE-LABEL: @uniform_store_of_loop_varying( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 -; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[TMP3:%.*]] = add zeroinitializer, [[TMP2]] -; SCALABLE-NEXT: [[TMP4:%.*]] = mul [[TMP3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP5:%.*]] = add [[DOTSPLAT]], [[TMP4]] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() -; SCALABLE-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; SCALABLE-NEXT: [[TMP9:%.*]] = extractelement [[TMP5]], i32 [[TMP8]] -; SCALABLE-NEXT: store i64 [[TMP9]], ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP11]], align 8 -; SCALABLE-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] -; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP5:%.*]] = add zeroinitializer, [[TMP4]] +; SCALABLE-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP7:%.*]] = add [[DOTSPLAT]], [[TMP6]] +; SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; SCALABLE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32() +; SCALABLE-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 2 +; SCALABLE-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractelement [[TMP7]], i32 [[TMP12]] +; SCALABLE-NEXT: store i64 [[TMP13]], ptr [[B:%.*]], align 8 +; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP15]], align 8 +; SCALABLE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] +; SCALABLE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -970,33 +1004,31 @@ ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[B]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT4]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT6]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[VEC_IND]], <2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> ) -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[STEP_ADD]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 8, <2 x i1> ) -; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] -; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT5]], ptr [[TMP4]], align 8 -; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT7]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], -; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; FIXEDLEN-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; FIXEDLEN-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; FIXEDLEN-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; FIXEDLEN-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; FIXEDLEN-NEXT: store i64 [[TMP7]], ptr [[B:%.*]], align 8 +; FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; FIXEDLEN-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP10]], align 8 +; FIXEDLEN-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP11]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXEDLEN-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXEDLEN-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXEDLEN: middle.block: ; FIXEDLEN-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 ; FIXEDLEN-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1019,38 +1051,42 @@ ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]] -; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP9]] +; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP8]], i64 1024) -; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[VEC_IND]], [[BROADCAST_SPLAT]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] -; TF-SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT2]], ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]] -; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP11]], i64 1024) +; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[VEC_IND]], [[BROADCAST_SPLAT]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]] +; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT2]], ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] +; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; TF-SCALABLE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -1071,23 +1107,22 @@ ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[VEC_IND]], <2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> ) -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] -; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP2]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; TF-FIXEDLEN-NEXT: store i64 [[TMP3]], ptr [[B:%.*]], align 8 +; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; TF-FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; TF-FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; TF-FIXEDLEN: middle.block: ; TF-FIXEDLEN-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 ; TF-FIXEDLEN-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1125,39 +1160,43 @@ ; SCALABLE-LABEL: @conditional_uniform_store( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; SCALABLE-NEXT: [[TMP3:%.*]] = add [[TMP2]], zeroinitializer -; SCALABLE-NEXT: [[TMP4:%.*]] = mul [[TMP3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP4]] -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP5]] -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 -; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; SCALABLE-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP8]]) -; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP10]], align 8 -; SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP11]]) +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP13]], align 8 +; SCALABLE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALABLE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1184,33 +1223,33 @@ ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT4]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x ptr> poison, ptr [[B]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT6]], <2 x ptr> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT2]], <4 x ptr> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT4]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x ptr> poison, ptr [[B]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT6]], <4 x ptr> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 -; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i64> [[STEP_ADD]], -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 8, <2 x i1> [[TMP2]]) -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT7]], i32 8, <2 x i1> [[TMP3]]) +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp ugt <4 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT3]], i32 8, <4 x i1> [[TMP2]]) +; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT5]], <4 x ptr> [[BROADCAST_SPLAT7]], i32 8, <4 x i1> [[TMP3]]) ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 -; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT5]], ptr [[TMP7]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT5]], ptr [[TMP7]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], ; FIXEDLEN-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXEDLEN: middle.block: @@ -1240,43 +1279,47 @@ ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = add [[TMP3]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = mul [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]] -; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 -; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP9]] +; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP8]], i64 1024) -; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP10]]) -; TF-SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]] -; TF-SCALABLE-NEXT: [[TMP12:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TF-SCALABLE-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer -; TF-SCALABLE-NEXT: [[TMP14:%.*]] = or [[TMP10]], [[TMP13]] -; TF-SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP15]], i32 8, [[TMP14]]) -; TF-SCALABLE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]] -; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; TF-SCALABLE-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP11]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp ugt [[VEC_IND]], shufflevector ( insertelement ( poison, i64 10, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP12]], zeroinitializer +; TF-SCALABLE-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], i32 8, [[TMP13]]) +; TF-SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]] +; TF-SCALABLE-NEXT: [[TMP15:%.*]] = xor [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TF-SCALABLE-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; TF-SCALABLE-NEXT: [[TMP17:%.*]] = or [[TMP13]], [[TMP16]] +; TF-SCALABLE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP18]], i32 8, [[TMP17]]) +; TF-SCALABLE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]] +; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; TF-SCALABLE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -1302,22 +1345,22 @@ ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x ptr> poison, ptr [[B:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT1]], <2 x ptr> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x ptr> poison, ptr [[B:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT1]], <4 x ptr> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-FIXEDLEN-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], -; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT]], <2 x ptr> [[BROADCAST_SPLAT2]], i32 8, <2 x i1> [[TMP1]]) +; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP1]]) ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; TF-FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; TF-FIXEDLEN-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; TF-FIXEDLEN: middle.block: @@ -1368,26 +1411,29 @@ ; SCALABLE-LABEL: @uniform_store_unaligned( ; SCALABLE-NEXT: entry: ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 -; SCALABLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] -; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1409,23 +1455,23 @@ ; FIXEDLEN-NEXT: entry: ; FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FIXEDLEN: vector.ph: -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i64 0 -; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; FIXEDLEN-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXEDLEN: vector.body: ; FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 -; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 -; FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 -; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 +; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 +; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXEDLEN-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; FIXEDLEN: middle.block: @@ -1450,26 +1496,29 @@ ; TF-SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-SCALABLE: vector.ph: ; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]] -; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP0]] +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 -; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-SCALABLE: vector.body: ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TF-SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 -; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP3]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024) ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 -; TF-SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP3]] -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; TF-SCALABLE-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; TF-SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -1490,8 +1539,8 @@ ; TF-FIXEDLEN-NEXT: entry: ; TF-FIXEDLEN-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TF-FIXEDLEN: vector.ph: -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i64 0 -; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 +; TF-FIXEDLEN-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; TF-FIXEDLEN-NEXT: br label [[VECTOR_BODY:%.*]] ; TF-FIXEDLEN: vector.body: ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1499,8 +1548,8 @@ ; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 -; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; TF-FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 +; TF-FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; TF-FIXEDLEN: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll @@ -12,27 +12,20 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i16> poison, i16 [[V]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT3]], <2 x i16> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[V:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <2 x i64> [[VEC_IND]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], <2 x i64> [[STEP_ADD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP0]], i32 2, <2 x i1> , <2 x i16> poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP1]], i32 2, <2 x i1> , <2 x i16> poison) -; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i16> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i16> [[WIDE_MASKED_GATHER2]], [[BROADCAST_SPLAT4]] -; CHECK-NEXT: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> [[TMP2]], <2 x ptr> [[TMP0]], i32 2, <2 x i1> ) -; CHECK-NEXT: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> [[TMP3]], <2 x ptr> [[TMP1]], i32 2, <2 x i1> ) +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> [[TMP0]], i32 2, <4 x i1> , <4 x i16> poison) +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i16> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> [[TMP1]], <4 x ptr> [[TMP0]], i32 2, <4 x i1> ) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -47,7 +40,7 @@ ; CHECK-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ;