Index: llvm/lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- llvm/lib/Target/AArch64/AArch64Subtarget.h +++ llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -272,7 +272,7 @@ bool HardenSlsRetBr = false; bool HardenSlsBlr = false; bool HardenSlsNoComdat = false; - uint8_t MaxInterleaveFactor = 2; + uint8_t MaxInterleaveFactor = 4; uint8_t VectorInsertExtractBaseCost = 3; uint16_t CacheLineSize = 0; uint16_t PrefetchDistance = 0; Index: llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll @@ -21,38 +21,56 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 15 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 31 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934576 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934560 ; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]] ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i16, i16* [[PSRC:%.*]], i64 [[N_VEC]] ; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i16, i16* [[PDST:%.*]], i64 [[N_VEC]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT9]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT15]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT17]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT19]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[INDEX]] -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i64 8 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <8 x i16>* -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x 
i16>, <8 x i16>* [[TMP5]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT10]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[NEXT_GEP6]] to <8 x i16>* -; CHECK-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* [[TMP8]], align 2 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, i16* [[NEXT_GEP6]], i64 8 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>* -; CHECK-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* [[TMP10]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i16>, <8 x i16>* [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i64 16 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i64 24 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i16>, <8 x i16>* [[TMP9]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD12]], <8 x i16> [[BROADCAST_SPLAT16]]) +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD13]], <8 x i16> [[BROADCAST_SPLAT18]]) +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD14]], <8 x i16> [[BROADCAST_SPLAT20]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[NEXT_GEP8]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* [[TMP14]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i16, i16* [[NEXT_GEP8]], i64 8 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16* [[TMP15]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* [[TMP16]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i16, i16* [[NEXT_GEP8]], i64 16 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[TMP17]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* [[TMP18]], align 2 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i16, i16* [[NEXT_GEP8]], i64 24 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16* [[TMP19]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> [[TMP13]], <8 x i16>* [[TMP20]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] @@ -66,10 +84,10 @@ ; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i16* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRC_ADDR_08]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP13:%.*]] 
= tail call i16 @llvm.sadd.sat.i16(i16 [[TMP12]], i16 [[OFFSET]]) +; CHECK-NEXT: [[TMP22:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP23:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP22]], i16 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, i16* [[PDST_ADDR_07]], i64 1 -; CHECK-NEXT: store i16 [[TMP13]], i16* [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i16 [[TMP23]], i16* [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -116,87 +134,105 @@ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 7 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 31 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 63 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934560 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934528 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT6]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT12]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT14]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT16]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PSRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i64 16 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT7]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* 
[[NEXT_GEP3]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* [[TMP8]], align 2 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[NEXT_GEP3]], i64 16 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[TMP10]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i64 32 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i64 48 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, <16 x i8>* [[TMP9]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD9]], <16 x i8> [[BROADCAST_SPLAT13]]) +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD10]], <16 x i8> [[BROADCAST_SPLAT15]]) +; CHECK-NEXT: [[TMP13:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD11]], <16 x i8> [[BROADCAST_SPLAT17]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[NEXT_GEP5]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* [[TMP14]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, i8* [[NEXT_GEP5]], i64 16 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[TMP16]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, i8* [[NEXT_GEP5]], i64 32 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* [[TMP18]], align 2 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[NEXT_GEP5]], i64 48 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP13]], <16 x i8>* [[TMP20]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END19:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]] -; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]] -; CHECK-NEXT: [[CAST_CRD12:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END13:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD12]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 24 +; CHECK-NEXT: [[IND_END27:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]] +; CHECK-NEXT: [[IND_END24:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]] +; CHECK-NEXT: [[CAST_CRD20:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END21:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD20]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 56 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = 
icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[BLOCKSIZE]], -1 -; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], 1 -; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[TMP14]], 8589934584 -; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC9]] to i32 +; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[BLOCKSIZE]], -1 +; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP22]] to i64 +; CHECK-NEXT: [[TMP24:%.*]] = add nuw nsw i64 [[TMP23]], 1 +; CHECK-NEXT: [[N_VEC19:%.*]] = and i64 [[TMP24]], 8589934584 +; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC19]] to i32 ; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]] -; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC9]] -; CHECK-NEXT: [[IND_END18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC9]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT25]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[IND_END23:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC19]] +; CHECK-NEXT: [[IND_END26:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC19]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT34:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT35:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT34]], <8 x i8> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP22:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX10]] -; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX10]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[NEXT_GEP22]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 2 -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD24]], <8 x i8> [[BROADCAST_SPLAT26]]) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[NEXT_GEP23]] to <8 x i8>* -; CHECK-NEXT: store <8 x i8> [[TMP16]], <8 x i8>* [[TMP17]], align 2 -; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX10]], 8 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC9]] -; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[INDEX29:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT36:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP31:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX29]] +; CHECK-NEXT: [[NEXT_GEP32:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX29]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8* [[NEXT_GEP31]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <8 x i8>, <8 x i8>* [[TMP25]], align 2 +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD33]], <8 x i8> [[BROADCAST_SPLAT35]]) +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[NEXT_GEP32]] to <8 x i8>* +; CHECK-NEXT: store <8 x i8> [[TMP26]], <8 x i8>* 
[[TMP27]], align 2 +; CHECK-NEXT: [[INDEX_NEXT36]] = add nuw i64 [[INDEX29]], 8 +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT36]], [[N_VEC19]] +; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP14]], [[N_VEC9]] -; CHECK-NEXT: br i1 [[CMP_N20]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N28:%.*]] = icmp eq i64 [[TMP24]], [[N_VEC19]] +; CHECK-NEXT: br i1 [[CMP_N28]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8* [ [[IND_END15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END16]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8* [ [[IND_END18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi i8* [ [[IND_END23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi i8* [ [[IND_END26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END27]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL14]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL17]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PSRC_ADDR_08]], i64 1 -; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP20:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP19]], i8 [[OFFSET]]) +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP30:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP29]], i8 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PDST_ADDR_07]], i64 1 -; CHECK-NEXT: store i8 [[TMP20]], i8* [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i8 [[TMP30]], i8* [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]] Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll +++ 
llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll @@ -11,46 +11,76 @@ ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 1, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -1, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 1, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -1, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -1, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi i64 [ -1, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to * -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP13]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP16]] to * -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , * [[TMP17]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64( [[WIDE_LOAD]]) -; CHECK-NEXT: [[TMP19]] = and i64 [[TMP18]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64( [[WIDE_LOAD3]]) -; CHECK-NEXT: [[TMP21]] = and i64 [[TMP20]], [[VEC_PHI2]] -; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]] -; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 
[[INDEX]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 6 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, i64* [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i64* [[TMP24]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP25]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP20]], i32 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i64* [[TMP28]] to * +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , * [[TMP29]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, i64* [[TMP20]], i32 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i64* [[TMP32]] to * +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , * [[TMP33]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 6 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i64, i64* [[TMP20]], i32 [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = bitcast i64* [[TMP36]] to * +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load , * [[TMP37]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64( [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP39]] = and i64 [[TMP38]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP40:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64( [[WIDE_LOAD5]]) +; CHECK-NEXT: [[TMP41]] = and i64 [[TMP40]], [[VEC_PHI2]] +; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64( [[WIDE_LOAD6]]) +; CHECK-NEXT: [[TMP43]] = and i64 [[TMP42]], [[VEC_PHI3]] +; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64( [[WIDE_LOAD7]]) +; CHECK-NEXT: [[TMP45]] = and i64 [[TMP44]], [[VEC_PHI4]] +; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP47]] +; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = and i64 [[TMP21]], [[TMP19]] +; CHECK-NEXT: [[BIN_RDX:%.*]] = and i64 [[TMP41]], [[TMP39]] +; CHECK-NEXT: [[BIN_RDX8:%.*]] = and i64 [[TMP43]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX9:%.*]] = and i64 [[TMP45]], [[BIN_RDX8]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: @@ -58,34 +88,34 @@ ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 1, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[BIN_RDX]], 
[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 1, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[BIN_RDX9]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[N]], 2 -; CHECK-NEXT: [[N_VEC6:%.*]] = sub i64 [[N]], [[N_MOD_VF5]] +; CHECK-NEXT: [[N_MOD_VF11:%.*]] = urem i64 [[N]], 2 +; CHECK-NEXT: [[N_VEC12:%.*]] = sub i64 [[N]], [[N_MOD_VF11]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX8]], 0 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0 -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4 -; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD10]]) -; CHECK-NEXT: [[TMP30]] = and i64 [[TMP29]], [[VEC_PHI9]] -; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2 -; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]] -; CHECK-NEXT: br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP54:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP49:%.*]] = add i64 [[INDEX14]], 0 +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP49]] +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i64, i64* [[TMP50]], i32 0 +; CHECK-NEXT: [[TMP52:%.*]] = bitcast i64* [[TMP51]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <2 x i64>, <2 x i64>* [[TMP52]], align 4 +; CHECK-NEXT: [[TMP53:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD16]]) +; CHECK-NEXT: [[TMP54]] = and i64 [[TMP53]], [[VEC_PHI15]] +; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX14]], 2 +; CHECK-NEXT: [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC12]] +; CHECK-NEXT: br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[N]], [[N_VEC6]] -; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N13:%.*]] = icmp eq i64 [[N]], [[N_VEC12]] +; CHECK-NEXT: br i1 [[CMP_N13]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX12:%.*]] = phi i64 [ 1, [[ITER_CHECK]] ], [ [[BIN_RDX]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP30]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], 
[[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i64 [ 1, [[ITER_CHECK]] ], [ [[BIN_RDX9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP54]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX12]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX18]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]] ; CHECK-NEXT: [[L3:%.*]] = load i64, i64* [[L2]], align 4 ; CHECK-NEXT: [[AND]] = and i64 [[RDX]], [[L3]] @@ -93,10 +123,10 @@ ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end.loopexit: -; CHECK-NEXT: [[AND_LCSSA4:%.*]] = phi i64 [ [[AND]], [[FOR_BODY]] ], [ [[TMP30]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[AND_LCSSA10:%.*]] = phi i64 [ [[AND]], [[FOR_BODY]] ], [ [[TMP54]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: -; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i64 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[AND_LCSSA4]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i64 [ [[BIN_RDX9]], [[MIDDLE_BLOCK]] ], [ [[AND_LCSSA10]], [[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: ret i64 [[AND_LCSSA]] ; entry: Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -295,47 +295,87 @@ ; CHECK-LABEL: @gather_nxv4i32_ind64_stride2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 -; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector [[DOTSPLATINSERT2]], poison, zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = add [[DOTSPLAT3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw 
i64 [[TMP6]], 2 -; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[REASS_ADD:%.*]] = shl [[DOTSPLAT]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[STEP_ADD1:%.*]] = add [[VEC_IND]], [[REASS_ADD]] +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 ; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector [[DOTSPLATINSERT4]], poison, zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = add [[DOTSPLAT5]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = add [[DOTSPLAT3]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = shl [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = shl [[TMP9]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[B:%.*]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[B]], [[TMP11]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) -; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to * -; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], * [[TMP15]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 2 -; CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to * -; CHECK-NEXT: store [[WIDE_MASKED_GATHER6]], * [[TMP20]], align 4 -; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP21]], 3 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = add [[DOTSPLAT5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; CHECK-NEXT: [[DOTSPLAT7:%.*]] = shufflevector [[DOTSPLATINSERT6]], poison, zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = add [[DOTSPLAT7]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = add [[DOTSPLAT5]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement poison, i64 [[TMP14]], i64 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector [[DOTSPLATINSERT8]], poison, zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = add [[DOTSPLAT9]], [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = add [[DOTSPLAT5]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = 
mul nuw nsw i64 [[TMP17]], 12 +; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement poison, i64 [[TMP18]], i64 0 +; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector [[DOTSPLATINSERT10]], poison, zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = add [[DOTSPLAT11]], [[TMP7]] +; CHECK-NEXT: [[TMP20:%.*]] = add [[DOTSPLAT5]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = shl [[TMP8]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP22:%.*]] = shl [[TMP12]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP16]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = shl [[TMP20]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[B:%.*]], [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[B]], [[TMP22]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[B]], [[TMP23]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[B]], [[TMP24]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP25]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP26]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP27]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP28]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to * +; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], * [[TMP30]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP32:%.*]] = shl nuw nsw i32 [[TMP31]], 2 +; CHECK-NEXT: [[TMP33:%.*]] = zext i32 [[TMP32]] to i64 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to * +; CHECK-NEXT: store [[WIDE_MASKED_GATHER12]], * [[TMP35]], align 4 +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP37:%.*]] = shl nuw nsw i32 [[TMP36]], 3 +; CHECK-NEXT: [[TMP38:%.*]] = zext i32 [[TMP37]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast float* [[TMP39]] to * +; CHECK-NEXT: store [[WIDE_MASKED_GATHER13]], * [[TMP40]], align 4 +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = mul nuw nsw i32 [[TMP41]], 12 +; CHECK-NEXT: [[TMP43:%.*]] = zext i32 [[TMP42]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to * +; CHECK-NEXT: store [[WIDE_MASKED_GATHER14]], * [[TMP45]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP47:%.*]] = shl nuw nsw i64 [[TMP46]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], [[TMP47]] +; CHECK-NEXT: [[REASS_ADD15:%.*]] = shl [[DOTSPLAT]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD1]], [[REASS_ADD15]] +; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -346,9 +386,9 @@ ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INDVARS_IV_STRIDE2:%.*]] = shl i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_STRIDE2]] -; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP49:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[TMP24]], float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: store float [[TMP49]], float* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll @@ -8,10 +8,16 @@ ; CHECK: vector.body ; CHECK: %[[LOAD1:.*]] = load i128, i128* {{.*}} ; CHECK-NEXT: %[[LOAD2:.*]] = load i128, i128* {{.*}} +; CHECK-NEXT: %[[LOAD3:.*]] = load i128, i128* {{.*}} +; CHECK-NEXT: %[[LOAD4:.*]] = load i128, i128* {{.*}} ; CHECK-NEXT: %[[ADD1:.*]] = add nsw i128 %[[LOAD1]], 42 ; CHECK-NEXT: %[[ADD2:.*]] = add nsw i128 %[[LOAD2]], 42 +; CHECK-NEXT: %[[ADD3:.*]] = add nsw i128 %[[LOAD3]], 42 +; CHECK-NEXT: %[[ADD4:.*]] = add nsw i128 %[[LOAD4]], 42 ; CHECK-NEXT: store i128 %[[ADD1]], i128* {{.*}} ; CHECK-NEXT: store i128 %[[ADD2]], i128* {{.*}} +; CHECK-NEXT: store i128 %[[ADD3]], i128* {{.*}} +; CHECK-NEXT: store i128 %[[ADD4]], i128* {{.*}} entry: br label %for.body @@ -35,10 +41,16 @@ ; CHECK: vector.body ; CHECK: %[[LOAD1:.*]] = load fp128, fp128* ; CHECK-NEXT: %[[LOAD2:.*]] = load fp128, fp128* +; CHECK-NEXT: %[[LOAD3:.*]] = load fp128, fp128* +; CHECK-NEXT: %[[LOAD4:.*]] = load fp128, fp128* ; CHECK-NEXT: %[[FSUB1:.*]] = fsub fp128 %[[LOAD1]], 0xL00000000000000008000000000000000 ; CHECK-NEXT: %[[FSUB2:.*]] = fsub fp128 %[[LOAD2]], 0xL00000000000000008000000000000000 +; CHECK-NEXT: %[[FSUB3:.*]] = fsub fp128 %[[LOAD3]], 0xL00000000000000008000000000000000 +; CHECK-NEXT: %[[FSUB4:.*]] = fsub fp128 %[[LOAD4]], 0xL00000000000000008000000000000000 ; CHECK-NEXT: store fp128 %[[FSUB1]], fp128* {{.*}} ; CHECK-NEXT: store fp128 %[[FSUB2]], fp128* {{.*}} +; CHECK-NEXT: store fp128 %[[FSUB3]], fp128* {{.*}} +; CHECK-NEXT: store fp128 %[[FSUB4]], fp128* {{.*}} entry: br label %for.body @@ -62,8 +74,12 @@ ; CHECK: vector.body ; CHECK: %[[GEP1:.*]] = getelementptr inbounds i128, i128* %ptr ; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i128, i128* %ptr +; CHECK-NEXT: %[[GEP3:.*]] = getelementptr 
inbounds i128, i128* %ptr +; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds i128, i128* %ptr ; CHECK-NEXT: store i128 %val, i128* %[[GEP1]] ; CHECK-NEXT: store i128 %val, i128* %[[GEP2]] +; CHECK-NEXT: store i128 %val, i128* %[[GEP3]] +; CHECK-NEXT: store i128 %val, i128* %[[GEP4]] entry: br label %for.body Index: llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll =================================================================== --- llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll +++ llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll @@ -56,33 +56,53 @@ ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>* ; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8, !alias.scope !0 -; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD11]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP8:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD11]], -; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x double> , <2 x double> [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP8]], <2 x double> , <2 x double> [[WIDE_LOAD11]] -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP5]], <2 x double> zeroinitializer, <2 x double> [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP6]], <2 x double> zeroinitializer, <2 x double> [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP14]], align 8, !alias.scope !3, !noalias !0 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[TMP13]], i64 2 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP16]], align 8, !alias.scope !3, !noalias !0 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20000 -; CHECK-NEXT: br i1 [[TMP17]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP5]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 6 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP9:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD11]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD12]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD13]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP14:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD11]], +; CHECK-NEXT: [[TMP15:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD12]], +; CHECK-NEXT: [[TMP16:%.*]] = fcmp 
ogt <2 x double> [[WIDE_LOAD13]], +; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP13]], <2 x double> , <2 x double> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP18:%.*]] = select <2 x i1> [[TMP14]], <2 x double> , <2 x double> [[WIDE_LOAD11]] +; CHECK-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP15]], <2 x double> , <2 x double> [[WIDE_LOAD12]] +; CHECK-NEXT: [[TMP20:%.*]] = select <2 x i1> [[TMP16]], <2 x double> , <2 x double> [[WIDE_LOAD13]] +; CHECK-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP9]], <2 x double> zeroinitializer, <2 x double> [[TMP17]] +; CHECK-NEXT: [[TMP22:%.*]] = select <2 x i1> [[TMP10]], <2 x double> zeroinitializer, <2 x double> [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = select <2 x i1> [[TMP11]], <2 x double> zeroinitializer, <2 x double> [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = select <2 x i1> [[TMP12]], <2 x double> zeroinitializer, <2 x double> [[TMP20]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP21]], <2 x double>* [[TMP26]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP25]], i64 2 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast double* [[TMP27]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP22]], <2 x double>* [[TMP28]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP25]], i64 4 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP23]], <2 x double>* [[TMP30]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TMP25]], i64 6 +; CHECK-NEXT: [[TMP32:%.*]] = bitcast double* [[TMP31]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP24]], <2 x double>* [[TMP32]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20000 +; CHECK-NEXT: br i1 [[TMP33]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: ; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_05]] to i64 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt double [[TMP18]], 0.000000e+00 -; CHECK-NEXT: [[CMP1_I:%.*]] = fcmp ogt double [[TMP18]], 6.000000e+00 -; CHECK-NEXT: [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP18]] +; CHECK-NEXT: [[TMP34:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt double [[TMP34]], 0.000000e+00 +; CHECK-NEXT: [[CMP1_I:%.*]] = fcmp ogt double [[TMP34]], 6.000000e+00 +; CHECK-NEXT: [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP34]] ; CHECK-NEXT: [[RETVAL_0_I:%.*]] = select i1 [[CMP_I]], double 0.000000e+00, double [[DOTV_I]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IDXPROM]] ; CHECK-NEXT: store double [[RETVAL_0_I]], double* [[ARRAYIDX2]], align 8 Index: llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll 
=================================================================== --- llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll +++ llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll @@ -34,51 +34,71 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 ; CHECK-NEXT: [[UMIN16:%.*]] = call i64 @llvm.umin.i64(i64 [[UMIN]], i64 [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[UMIN16]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 5 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER22:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 9 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER32:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP1]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP1]], 7 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 8, i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> , i64 [[SUM_NEXT_PEEL]], i64 0 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI18:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI20:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI21:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI22:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, i64* [[START_I]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 2 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, i64* [[START_I2_PEEL]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, i64* [[TMP5]], i64 4 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP9]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 2 +; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x i64>, <2 x i64>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP5]], i64 6 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[TMP11]] to <2 x i64>* -; CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <2 x i64>, <2 x i64>* [[TMP12]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: 
[[TMP14:%.*]] = add <2 x i64> [[WIDE_LOAD19]], [[VEC_PHI18]]
-; CHECK-NEXT: [[TMP15]] = add <2 x i64> [[TMP13]], [[WIDE_LOAD20]]
-; CHECK-NEXT: [[TMP16]] = add <2 x i64> [[TMP14]], [[WIDE_LOAD21]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD25:%.*]] = load <2 x i64>, <2 x i64>* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[START_I2_PEEL]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP13]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <2 x i64>, <2 x i64>* [[TMP14]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, i64* [[TMP13]], i64 2
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[TMP15]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD27:%.*]] = load <2 x i64>, <2 x i64>* [[TMP16]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i64, i64* [[TMP13]], i64 4
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64* [[TMP17]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD28:%.*]] = load <2 x i64>, <2 x i64>* [[TMP18]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, i64* [[TMP13]], i64 6
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64* [[TMP19]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, <2 x i64>* [[TMP20]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP22:%.*]] = add <2 x i64> [[WIDE_LOAD23]], [[VEC_PHI20]]
+; CHECK-NEXT: [[TMP23:%.*]] = add <2 x i64> [[WIDE_LOAD24]], [[VEC_PHI21]]
+; CHECK-NEXT: [[TMP24:%.*]] = add <2 x i64> [[WIDE_LOAD25]], [[VEC_PHI22]]
+; CHECK-NEXT: [[TMP25]] = add <2 x i64> [[TMP21]], [[WIDE_LOAD26]]
+; CHECK-NEXT: [[TMP26]] = add <2 x i64> [[TMP22]], [[WIDE_LOAD27]]
+; CHECK-NEXT: [[TMP27]] = add <2 x i64> [[TMP23]], [[WIDE_LOAD28]]
+; CHECK-NEXT: [[TMP28]] = add <2 x i64> [[TMP24]], [[WIDE_LOAD29]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
-; CHECK-NEXT: br label [[LOOP_PREHEADER22]]
-; CHECK: loop.preheader22:
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP26]], [[TMP25]]
+; CHECK-NEXT: [[BIN_RDX30:%.*]] = add <2 x i64> [[TMP27]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX31:%.*]] = add <2 x i64> [[TMP28]], [[BIN_RDX30]]
+; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX31]])
+; CHECK-NEXT: br label [[LOOP_PREHEADER32]]
+; CHECK: loop.preheader32:
 ; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT12:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER22]] ]
-; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT12]] ], [ [[SUM_PH]], [[LOOP_PREHEADER22]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT12:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER32]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT12]] ], [ [[SUM_PH]], [[LOOP_PREHEADER32]] ]
 ; CHECK-NEXT: [[INRANGE_I:%.*]] = icmp ult i64 [[SUB_I]], [[IV]]
 ; CHECK-NEXT: br i1 [[INRANGE_I]], label [[ERROR_I:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT:%.*]]
 ; CHECK: error.i:
@@ -159,59 +179,87 @@
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
 ; CHECK-NEXT: [[UMIN29:%.*]] = call i64 @llvm.umin.i64(i64 [[UMIN28]], i64 [[TMP0]])
 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[UMIN29]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 5
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER37:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 9
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER49:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP1]], 7
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 8, i64 [[N_MOD_VF]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], 1
 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> , i64 [[SUM_NEXT_PEEL]], i64 0
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI31:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI33:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI34:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI35:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, i64* [[START_I]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 4
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 2
 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, i64* [[START_I2_PEEL]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_LOAD36:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, i64* [[TMP5]], i64 4
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP9]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, <2 x i64>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP9]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD37:%.*]] = load <2 x i64>, <2 x i64>* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[TMP5]], i64 6
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[TMP11]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <2 x i64>, <2 x i64>* [[TMP12]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[START_I14_PEEL]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_LOAD38:%.*]] = load <2 x i64>, <2 x i64>* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, i64* [[START_I2_PEEL]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP13]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD35:%.*]] = load <2 x i64>, <2 x i64>* [[TMP14]], align 4
+; CHECK-NEXT: [[WIDE_LOAD39:%.*]] = load <2 x i64>, <2 x i64>* [[TMP14]], align 4
 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, i64* [[TMP13]], i64 2
 ; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[TMP15]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD36:%.*]] = load <2 x i64>, <2 x i64>* [[TMP16]], align 4
-; CHECK-NEXT: [[TMP17:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i64> [[WIDE_LOAD32]], [[VEC_PHI31]]
-; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[TMP17]], [[WIDE_LOAD33]]
-; CHECK-NEXT: [[TMP20:%.*]] = add <2 x i64> [[TMP18]], [[WIDE_LOAD34]]
-; CHECK-NEXT: [[TMP21]] = add <2 x i64> [[TMP19]], [[WIDE_LOAD35]]
-; CHECK-NEXT: [[TMP22]] = add <2 x i64> [[TMP20]], [[WIDE_LOAD36]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD40:%.*]] = load <2 x i64>, <2 x i64>* [[TMP16]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i64, i64* [[TMP13]], i64 4
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64* [[TMP17]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD41:%.*]] = load <2 x i64>, <2 x i64>* [[TMP18]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, i64* [[TMP13]], i64 6
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64* [[TMP19]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD42:%.*]] = load <2 x i64>, <2 x i64>* [[TMP20]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[START_I14_PEEL]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast i64* [[TMP21]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD43:%.*]] = load <2 x i64>, <2 x i64>* [[TMP22]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i64, i64* [[TMP21]], i64 2
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD44:%.*]] = load <2 x i64>, <2 x i64>* [[TMP24]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i64, i64* [[TMP21]], i64 4
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast i64* [[TMP25]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD45:%.*]] = load <2 x i64>, <2 x i64>* [[TMP26]], align 4
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i64, i64* [[TMP21]], i64 6
+; CHECK-NEXT: [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD46:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
+; CHECK-NEXT: [[TMP29:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i64> [[WIDE_LOAD36]], [[VEC_PHI33]]
+; CHECK-NEXT: [[TMP31:%.*]] = add <2 x i64> [[WIDE_LOAD37]], [[VEC_PHI34]]
+; CHECK-NEXT: [[TMP32:%.*]] = add <2 x i64> [[WIDE_LOAD38]], [[VEC_PHI35]]
+; CHECK-NEXT: [[TMP33:%.*]] = add <2 x i64> [[TMP29]], [[WIDE_LOAD39]]
+; CHECK-NEXT: [[TMP34:%.*]] = add <2 x i64> [[TMP30]], [[WIDE_LOAD40]]
+; CHECK-NEXT: [[TMP35:%.*]] = add <2 x i64> [[TMP31]], [[WIDE_LOAD41]]
+; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i64> [[TMP32]], [[WIDE_LOAD42]]
+; CHECK-NEXT: [[TMP37]] = add <2 x i64> [[TMP33]], [[WIDE_LOAD43]]
+; CHECK-NEXT: [[TMP38]] = add <2 x i64> [[TMP34]], [[WIDE_LOAD44]]
+; CHECK-NEXT: [[TMP39]] = add <2 x i64> [[TMP35]], [[WIDE_LOAD45]]
+; CHECK-NEXT: [[TMP40]] = add <2 x i64> [[TMP36]], [[WIDE_LOAD46]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP22]], [[TMP21]]
-; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
-; CHECK-NEXT: br label [[LOOP_PREHEADER37]]
-; CHECK: loop.preheader37:
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP38]], [[TMP37]]
+; CHECK-NEXT: [[BIN_RDX47:%.*]] = add <2 x i64> [[TMP39]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX48:%.*]] = add <2 x i64> [[TMP40]], [[BIN_RDX47]]
+; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX48]])
+; CHECK-NEXT: br label [[LOOP_PREHEADER49]]
+; CHECK: loop.preheader49:
 ; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT24:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER37]] ]
-; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT24]] ], [ [[SUM_PH]], [[LOOP_PREHEADER37]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT24:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER49]] ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT24]] ], [ [[SUM_PH]], [[LOOP_PREHEADER49]] ]
 ; CHECK-NEXT: [[INRANGE_I:%.*]] = icmp ult i64 [[SUB_I]], [[IV]]
 ; CHECK-NEXT: br i1 [[INRANGE_I]], label [[ERROR_I:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT:%.*]]
 ; CHECK: error.i: