Index: llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -39,7 +39,7 @@ ; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT7]], <16 x i32>* [[TMP2]], align 4, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP1]], <16 x i32> undef), !alias.scope !3 ; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> @@ -67,7 +67,7 @@ ; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT19]], <8 x i32>* [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX12]], 8 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC11]] -; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[WIDE_MASKED_GATHER20:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT17]], i32 4, <8 x i1> [[TMP6]], <8 x i32> undef) ; CHECK-NEXT: [[PREDPHI21:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[WIDE_MASKED_GATHER20]], <8 x i32> @@ -79,9 +79,9 @@ ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32* [[A]], null -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[LATCH]], label [[COND_LOAD:%.*]] ; CHECK: cond_load: ; CHECK-NEXT: [[ALOAD:%.*]] = load i32, i32* [[A]], align 4 @@ -90,7 +90,7 @@ ; CHECK-NEXT: [[A_LCSSA:%.*]] = phi i32 [ [[ALOAD]], [[COND_LOAD]] ], [ 1, [[FOR_BODY]] ] ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], [[LOOP9:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: [[A_LCSSA_LCSSA8:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_END]] @@ -104,10 +104,10 @@ for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i - %tmp2 = load i32, i32* %tmp1, align 8 + %i1 = getelementptr inbounds i32, i32* %b, i64 %i + %i2 = load i32, i32* %i1, align 8 %cmp = icmp ne i32* %a, null - store i32 %ntrunc, i32* %tmp1 + store i32 %ntrunc, i32* %i1 br i1 %cmp, 
label %cond_load, label %latch cond_load: Index: llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -52,7 +52,7 @@ ; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP9]], [[TMP8]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <16 x i32> [[TMP10]], [[BIN_RDX]] @@ -73,7 +73,7 @@ ; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP7:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T4]] @@ -140,7 +140,7 @@ ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT7]], <16 x i32*> [[BROADCAST_SPLAT9]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !11 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX4]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -170,7 +170,7 @@ ; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[BROADCAST_SPLAT20]], <8 x i32*> [[BROADCAST_SPLAT22]], i32 4, <8 x i1> [[TMP7]]) ; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX13]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC12]] -; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC12]] ; CHECK-NEXT: br i1 [[CMP_N15]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] @@ -190,7 +190,7 @@ ; CHECK: latch: ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], [[LOOP16:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: @@ -274,7 +274,7 @@ ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT20]], i32 4, <16 x 
i1> [[TMP2]]), !alias.scope !24, !noalias !23
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP25:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX15]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -307,7 +307,7 @@
; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[WIDE_MASKED_LOAD32]], <8 x i32*> [[BROADCAST_SPLAT34]], i32 4, <8 x i1> [[TMP9]])
; CHECK-NEXT: [[INDEX_NEXT25]] = add nuw i64 [[INDEX24]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC23]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[CMP_N26:%.*]] = icmp eq i64 [[SMAX21]], [[N_VEC23]]
; CHECK-NEXT: br i1 [[CMP_N26]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
@@ -329,7 +329,7 @@
; CHECK: latch:
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], [[LOOP27:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
Index: llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
@@ -0,0 +1,291 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+; The first LICM pass is to hoist/sink invariant stores if possible. Today LICM
+; does not hoist/sink these invariant stores. Even if that changes, we should
+; still vectorize these loops in case LICM is not run.
+
+; The second LICM pass, run after vectorization, hoists/sinks loop-invariant
+; instructions.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; All tests check that it is legal to vectorize the stores to an invariant
+; address.
+
+; InstCombine'd version of the corresponding test in
+; invariant-store-vectorization.ll. Now the store is no longer of an invariant
+; value; the value extracted from the last element of the vector is stored to
+; the invariant address with a scalar store.
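+;
+; A rough C sketch of the loop being tested (hypothetical variable names; the
+; IR below operates on the arguments %a, %b, %k and %n):
+;
+;   for (long i = 0; i < n; i++) {
+;     int t = b[i];                // variant load
+;     b[i] = (int)n;               // invariant value, variant address
+;     *a = (t == k) ? (int)n : k;  // variant value, invariant address
+;   }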
+define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) { +; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_diff_values_ic( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[SMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX4]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A3]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX4]], 9223372036854775804 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[K:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32>* [[TMP2]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32> [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 +; CHECK-NEXT: store i32 [[TMP3]], i32* [[A]], align 4, !alias.scope !3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[I1]], align 8 +; CHECK-NEXT: [[CMP:%.*]] = 
icmp eq i32 [[I2]], [[K]]
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
+; CHECK: cond_store:
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: cond_store_k:
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
+; CHECK-NEXT: store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %i1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %i2 = load i32, i32* %i1, align 8
+ %cmp = icmp eq i32 %i2, %k
+ store i32 %ntrunc, i32* %i1
+ br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+ br label %latch
+
+cond_store_k:
+ br label %latch
+
+latch:
+ %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+ store i32 %storeval, i32* %a
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; invariant val stored to invariant address, predicated on an invariant condition.
+; This is not treated as a predicated store since the block the store belongs to
+; is the latch block (which doesn't need to be predicated).
+; Variant/invariant values are stored to the invariant address.
+; The test checks that the last element of the phi is extracted and scalar-stored
+; into the uniform address within the loop.
+; Since the condition and the phi are loop-invariant, they are LICM'ed after
+; vectorization.
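+;
+; A rough C sketch of this pattern (hypothetical variable names; %a, %b, %k
+; and %n are the IR arguments; the dead load of b[i] is omitted):
+;
+;   int t = (int)n;
+;   int c = (t == k);   // loop-invariant condition
+;   for (long i = 0; i < n; i++) {
+;     b[i] = t;         // invariant value, variant address
+;     *a = c ? t : k;   // invariant value selected by the invariant
+;                       // condition, stored to the invariant address
+;   }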
+define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) { +; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_inv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]] +; CHECK-NEXT: [[SMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX4]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A3]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX4]], 9223372036854775804 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i32 3 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i32 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP3]], align 4, !alias.scope !8, !noalias !11 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[A]], align 4, !alias.scope !11 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[I1]], align 4 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]] +; CHECK: cond_store: +; CHECK-NEXT: br label [[LATCH]] +; CHECK: cond_store_k: +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ] +; 
CHECK-NEXT: store i32 [[STOREVAL]], i32* [[A]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %ntrunc = trunc i64 %n to i32 + %cmp = icmp eq i32 %ntrunc, %k + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] + %i1 = getelementptr inbounds i32, i32* %b, i64 %i + %i2 = load i32, i32* %i1, align 8 + store i32 %ntrunc, i32* %i1 + br i1 %cmp, label %cond_store, label %cond_store_k + +cond_store: + br label %latch + +cond_store_k: + br label %latch + +latch: + %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ] + store i32 %storeval, i32* %a + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; variant value stored to uniform address tests that the code gen extracts the +; last element from the variant vector and scalar stores it into the uniform +; address. +define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) { +; CHECK-LABEL: @variant_val_store_to_inv_address( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX4:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX4]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[B2:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A1]], i64 1 +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B2]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX4]], 9223372036854775804 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !15 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4, !alias.scope !18, !noalias !15 +; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I0:%.*]] = phi i32 [ [[I3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[I1]], align 8 +; CHECK-NEXT: store i32 [[I2]], i32* [[A]], align 4 +; CHECK-NEXT: [[I3]] = add i32 [[I0]], [[I2]] +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[I3_LCSSA:%.*]] = phi i32 [ [[I3]], [[FOR_BODY]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[I3_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RDX_LCSSA]] +; +entry: + %ntrunc = trunc i64 %n to i32 + %cmp = icmp eq i32 %ntrunc, %k + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %i0 = phi i32 [ %i3, %for.body ], [ 0, %entry ] + %i1 = getelementptr inbounds i32, i32* %b, i64 %i + %i2 = load i32, i32* %i1, align 8 + store i32 %i2, i32* %a + %i3 = add i32 %i0, %i2 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + %rdx.lcssa = phi i32 [ %i3, %for.body ] + ret i32 %rdx.lcssa +} Index: llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -35,18 +35,18 @@ for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ] - %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i - %tmp2 = load i32, i32* %tmp1, align 8 - %tmp3 = add i32 %tmp0, %tmp2 + %i0 = phi i32 [ %i3, %for.body ], [ 0, %entry ] + %i1 = getelementptr inbounds i32, i32* %b, i64 %i + %i2 = load i32, i32* %i1, align 8 + %i3 = add i32 %i0, %i2 store i32 %ntrunc, i32* %a %i.next = add nuw nsw i64 %i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end for.end: ; preds = %for.body - %tmp4 = phi i32 [ %tmp3, %for.body ] - ret i32 %tmp4 + %i4 = phi i32 [ %i3, %for.body ] + ret i32 %i4 } ; CHECK-LABEL: inv_val_store_to_inv_address( @@ -63,10 +63,10 @@ for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i - %tmp2 = load i32, i32* %tmp1, align 8 + %i1 = getelementptr inbounds i32, i32* %b, i64 %i + %i2 = load i32, i32* %i1, align 8 store i32 %ntrunc, i32* %a - store i32 %ntrunc, i32* %tmp1 + store i32 %ntrunc, i32* %i1 %i.next = add nuw nsw i64 
%i, 1 %cond = icmp slt i64 %i.next, %n br i1 %cond, label %for.body, label %for.end @@ -104,10 +104,10 @@ for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i - %tmp2 = load i32, i32* %tmp1, align 8 - %cmp = icmp eq i32 %tmp2, %k - store i32 %ntrunc, i32* %tmp1 + %i1 = getelementptr inbounds i32, i32* %b, i64 %i + %i2 = load i32, i32* %i1, align 8 + %cmp = icmp eq i32 %i2, %k + store i32 %ntrunc, i32* %i1 br i1 %cmp, label %cond_store, label %latch cond_store: @@ -137,10 +137,10 @@ for.body: ; preds = %for.body, %entry %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i - %tmp2 = load i32, i32* %tmp1, align 8 - %cmp = icmp eq i32 %tmp2, %k - store i32 %ntrunc, i32* %tmp1 + %i1 = getelementptr inbounds i32, i32* %b, i64 %i + %i2 = load i32, i32* %i1, align 8 + %cmp = icmp eq i32 %i2, %k + store i32 %ntrunc, i32* %i1 br i1 %cmp, label %cond_store, label %cond_store_k cond_store: @@ -160,284 +160,6 @@ ret void } -; Instcombine'd version of above test. Now the store is no longer of invariant -; value. -; scalar store the value extracted from the last element of the vector value. -; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic -; CHECK-NEXT: entry: -; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* -; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[K:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP5]], align 4 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> 
[[BROADCAST_SPLAT8]], <4 x i32> [[BROADCAST_SPLAT6]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 -; CHECK-NEXT: store i32 [[TMP6]], i32* [[A]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]] -; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]] -; CHECK: cond_store: -; CHECK-NEXT: br label [[LATCH]] -; CHECK: cond_store_k: -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ] -; CHECK-NEXT: store i32 [[STOREVAL]], i32* [[A]], align 4 -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) { -entry: - %ntrunc = trunc i64 %n to i32 - br label %for.body - -for.body: ; preds = %for.body, %entry - %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i - %tmp2 = load i32, i32* %tmp1, align 8 - %cmp = icmp eq i32 %tmp2, %k - store i32 %ntrunc, i32* %tmp1 - br i1 %cmp, label %cond_store, label %cond_store_k - -cond_store: - br label %latch - -cond_store_k: - br label %latch - -latch: - %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ] - store i32 %storeval, i32* %a - %i.next = add nuw nsw i64 %i, 1 - %cond = icmp slt i64 %i.next, %n - br i1 %cond, label %for.body, label %for.end - -for.end: ; preds = %for.body - ret void -} - -; invariant val stored to invariant address predicated on invariant condition -; This is not treated as a predicated store since the block the store belongs to -; is the latch block (which doesn't need to be predicated). -; variant/invariant values being stored to invariant address. -; test checks that the last element of the phi is extracted and scalar stored -; into the uniform address within the loop. -; Since the condition and the phi is loop invariant, they are LICM'ed after -; vectorization. 
-; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv -; CHECK-NEXT: entry: -; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]] -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* -; CHECK-NEXT: [[SMAX2:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i32 3 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[K]], i32 3 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32>* [[TMP7]], align 4 -; CHECK-NEXT: store i32 [[TMP5]], i32* [[A]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]] -; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4 -; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]] -; CHECK: cond_store: -; CHECK-NEXT: br label [[LATCH]] -; CHECK: cond_store_k: -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ] -; CHECK-NEXT: store i32 [[STOREVAL]], i32* [[A]], align 4 -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 
[[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) { -entry: - %ntrunc = trunc i64 %n to i32 - %cmp = icmp eq i32 %ntrunc, %k - br label %for.body - -for.body: ; preds = %for.body, %entry - %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] - %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i - %tmp2 = load i32, i32* %tmp1, align 8 - store i32 %ntrunc, i32* %tmp1 - br i1 %cmp, label %cond_store, label %cond_store_k - -cond_store: - br label %latch - -cond_store_k: - br label %latch - -latch: - %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ] - store i32 %storeval, i32* %a - %i.next = add nuw nsw i64 %i, 1 - %cond = icmp slt i64 %i.next, %n - br i1 %cond, label %for.body, label %for.end - -for.end: ; preds = %for.body - ret void -} - -; variant value stored to uniform address tests that the code gen extracts the -; last element from the variant vector and scalar stores it into the uniform -; address. -define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) { -; CHECK-LABEL: @variant_val_store_to_inv_address( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[B2:%.*]] = bitcast i32* [[B:%.*]] to i8* -; CHECK-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A1]], i64 1 -; CHECK-NEXT: [[SMAX3:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX3]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B2]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8, !alias.scope !36 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3 -; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4, !alias.scope !39, !noalias !36 -; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]] -; CHECK: middle.block: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] -; CHECK-NEXT: br i1 
[[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 -; CHECK-NEXT: [[TMP3]] = add i32 [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], [[LOOP42:!llvm.loop !.*]] -; CHECK: for.end.loopexit: -; CHECK-NEXT: [[TMP3_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[FOR_BODY]] ] -; CHECK-NEXT: br label [[FOR_END]] -; CHECK: for.end: -; CHECK-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP3_LCSSA]], [[FOR_END_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[RDX_LCSSA]] -; -entry: - %ntrunc = trunc i64 %n to i32 - %cmp = icmp eq i32 %ntrunc, %k - br label %for.body - -for.body: ; preds = %for.body, %entry - %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] - %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ] - %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i - %tmp2 = load i32, i32* %tmp1, align 8 - store i32 %tmp2, i32* %a - %tmp3 = add i32 %tmp0, %tmp2 - %i.next = add nuw nsw i64 %i, 1 - %cond = icmp slt i64 %i.next, %n - br i1 %cond, label %for.body, label %for.end - -for.end: ; preds = %for.body - %rdx.lcssa = phi i32 [ %tmp3, %for.body ] - ret i32 %rdx.lcssa -} - ; Multiple variant stores to the same uniform address ; We do not vectorize such loops currently. 
; for(; i < itr; i++) { @@ -544,43 +266,43 @@ ret i32 undef } -; cannot vectorize loop with unsafe dependency between uniform load (%tmp10) and store -; (%tmp12) to the same address +; cannot vectorize loop with unsafe dependency between uniform load (%i10) and store +; (%i12) to the same address ; PR39653 -; Note: %tmp10 could be replaced by phi(%arg4, %tmp12), a potentially vectorizable +; Note: %i10 could be replaced by phi(%arg4, %i12), a potentially vectorizable ; 1st-order-recurrence define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) { ; CHECK-LABEL: unsafe_dep_uniform_load_store ; CHECK-NOT: <4 x i32> bb: - %tmp = alloca i32 - store i32 %arg4, i32* %tmp - %tmp6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5 + %i = alloca i32 + store i32 %arg4, i32* %i + %i6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5 br label %bb7 bb7: - %tmp8 = phi i64 [ 0, %bb ], [ %tmp24, %bb7 ] - %tmp9 = phi i32 [ %arg1, %bb ], [ %tmp23, %bb7 ] - %tmp10 = load i32, i32* %tmp - %tmp11 = mul nsw i32 %tmp9, %tmp10 - %tmp12 = srem i32 %tmp11, 65536 - %tmp13 = add nsw i32 %tmp12, %tmp9 - %tmp14 = trunc i32 %tmp13 to i16 - %tmp15 = trunc i64 %tmp8 to i32 - %tmp16 = add i32 %arg, %tmp15 - %tmp17 = zext i32 %tmp16 to i64 - %tmp18 = getelementptr inbounds i16, i16* %tmp6, i64 %tmp17 - store i16 %tmp14, i16* %tmp18, align 2 - %tmp19 = add i32 %tmp13, %tmp9 - %tmp20 = trunc i32 %tmp19 to i16 - %tmp21 = and i16 %tmp20, 255 - %tmp22 = getelementptr inbounds i16, i16* %arg3, i64 %tmp17 - store i16 %tmp21, i16* %tmp22, align 2 - %tmp23 = add nsw i32 %tmp9, 1 - %tmp24 = add nuw nsw i64 %tmp8, 1 - %tmp25 = icmp eq i64 %tmp24, %arg2 - store i32 %tmp12, i32* %tmp - br i1 %tmp25, label %bb26, label %bb7 + %i8 = phi i64 [ 0, %bb ], [ %i24, %bb7 ] + %i9 = phi i32 [ %arg1, %bb ], [ %i23, %bb7 ] + %i10 = load i32, i32* %i + %i11 = mul nsw i32 %i9, %i10 + %i12 = srem i32 %i11, 65536 + %i13 = add nsw i32 %i12, %i9 + %i14 = trunc i32 %i13 to i16 + %i15 = trunc i64 %i8 to i32 + %i16 = add i32 %arg, %i15 + %i17 = zext i32 %i16 to i64 + %i18 = getelementptr inbounds i16, i16* %i6, i64 %i17 + store i16 %i14, i16* %i18, align 2 + %i19 = add i32 %i13, %i9 + %i20 = trunc i32 %i19 to i16 + %i21 = and i16 %i20, 255 + %i22 = getelementptr inbounds i16, i16* %arg3, i64 %i17 + store i16 %i21, i16* %i22, align 2 + %i23 = add nsw i32 %i9, 1 + %i24 = add nuw nsw i64 %i8, 1 + %i25 = icmp eq i64 %i24, %arg2 + store i32 %i12, i32* %i + br i1 %i25, label %bb26, label %bb7 bb26: ret void Index: llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll +++ llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll @@ -80,7 +80,7 @@ ; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !alias.scope !5, !noalias !7 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[Z]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[DOTOUTER]], label [[SCALAR_PH]] @@ -105,7 +105,7 @@ ; CHECK-NEXT: store i32 [[TMP20]], i32* [[TMP18]], align 4 ; CHECK-NEXT: [[J_NEXT]] = add nuw 
nsw i64 [[J]], 1 ; CHECK-NEXT: [[EXITCOND_INNER:%.*]] = icmp eq i64 [[J_NEXT]], [[Z]] -; CHECK-NEXT: br i1 [[EXITCOND_INNER]], label [[DOTOUTER]], label [[DOTINNER]], [[LOOP10:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND_INNER]], label [[DOTOUTER]], label [[DOTINNER]], !llvm.loop [[LOOP10:![0-9]+]] ; br label %.outer.preheader Index: llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll +++ llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll @@ -11,24 +11,31 @@ define void @f() { ; CHECK-LABEL: @f( - +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] ; CHECK: outer.header: ; CHECK-NEXT: [[TMP0:%.*]] = load i8*, i8** @d, align 1 ; CHECK-NEXT: [[C_0:%.*]] = call i1 @cond() -; CHECK-NEXT: br i1 [[C_0]], label %outer.exit.0, label %inner.1.header.preheader - +; CHECK-NEXT: br i1 [[C_0]], label [[OUTER_EXIT_0:%.*]], label [[INNER_1_HEADER_PREHEADER:%.*]] +; CHECK: inner.1.header.preheader: +; CHECK-NEXT: br label [[INNER_1_HEADER:%.*]] +; CHECK: inner.1.header: +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label [[INNER_1_LATCH:%.*]], label [[OUTER_LATCH:%.*]] +; CHECK: inner.1.latch: +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label [[OUTER_EXIT_1:%.*]], label [[INNER_1_HEADER]] +; CHECK: outer.latch: +; CHECK-NEXT: br label [[OUTER_HEADER]] ; CHECK: outer.exit.0: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i8* [ [[TMP0]], %outer.header ] -; CHECK-NEXT: br label %loop.preheader - +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i8* [ [[TMP0]], [[OUTER_HEADER]] ] +; CHECK-NEXT: br label [[LOOP_PREHEADER:%.*]] ; CHECK: outer.exit.1: -; CHECK-NEXT: [[DOTLCSSA1:%.*]] = phi i8* [ [[TMP0]], %inner.1.latch ] -; CHECK-NEXT: br label %loop.preheader - +; CHECK-NEXT: [[DOTLCSSA1:%.*]] = phi i8* [ [[TMP0]], [[INNER_1_LATCH]] ] +; CHECK-NEXT: br label [[LOOP_PREHEADER]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP1:%.*]] = phi i8* [ [[DOTLCSSA]], %outer.exit.0 ], [ [[DOTLCSSA1]], %outer.exit.1 ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i8* [ [[DOTLCSSA]], [[OUTER_EXIT_0]] ], [ [[DOTLCSSA1]], [[OUTER_EXIT_1]] ] ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] - ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[TMP1]], i64 1 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* bitcast (i32* @f.e to i8*), [[SCEVGEP]] @@ -36,10 +43,8 @@ ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true ; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] - ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] - ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0 @@ -49,15 +54,23 @@ ; CHECK-NEXT: store i8 10, i8* [[TMP0]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 500 -; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] - +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 500, 500 ; CHECK-NEXT: br i1 [[CMP_N]], 
label [[EXIT:%.*]], label [[SCALAR_PH]] - ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 500, %middle.block ], [ 0, %loop.preheader ], [ 0, %vector.memcheck ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 500, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[CONV6_US_US_US:%.*]] = zext i1 false to i32 +; CHECK-NEXT: store i32 [[CONV6_US_US_US]], i32* @f.e, align 1 +; CHECK-NEXT: store i8 10, i8* [[TMP1]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 500 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %outer.header Index: llvm/test/Transforms/LoopVectorize/pr50686.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/pr50686.ll +++ llvm/test/Transforms/LoopVectorize/pr50686.ll @@ -7,14 +7,6 @@ ; CHECK-LABEL: @m( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = bitcast i32* [[P:%.*]] to i8* -; CHECK-NEXT: [[I:%.*]] = load i32, i32* @k, align 4 -; CHECK-NEXT: [[CMP32:%.*]] = icmp slt i32 [[I]], [[Q:%.*]] -; CHECK-NEXT: br i1 [[CMP32]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND2_PREHEADER:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond2.preheader.loopexit: -; CHECK-NEXT: br label [[FOR_COND2_PREHEADER]] -; CHECK: for.cond2.preheader: ; CHECK-NEXT: [[ARRAYIDX9_1:%.*]] = getelementptr inbounds i32, i32* [[P2:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX9_2:%.*]] = getelementptr inbounds i32, i32* [[P2]], i64 2 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] @@ -58,20 +50,10 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 63, 60 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END17:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 60, [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND2_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[FOR_COND5_PREHEADER:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I1:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[I]], [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I1]] to i64 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[P2]], i64 [[IDXPROM]] -; CHECK-NEXT: store i32 2, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[I2:%.*]] = load i32, i32* @k, align 4 -; CHECK-NEXT: [[INC]] = add nsw i32 [[I2]], 1 -; CHECK-NEXT: store i32 [[INC]], i32* @k, align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[Q]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND2_PREHEADER_LOOPEXIT:%.*]] -; CHECK: for.cond5.preheader: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND5_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 60, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_COND5:%.*]] +; CHECK: for.cond5: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND5]] ] ; CHECK-NEXT: [[I3:%.*]] = load i32, i32* [[P2]], align 4 ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[I3]] ; CHECK-NEXT: [[I4:%.*]] = load i32, i32* [[ARRAYIDX9_1]], align 4 @@ -82,39 +64,17 @@ ; 
CHECK-NEXT: store i32 [[SUB_2]], i32* [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 63 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END17]], label [[FOR_COND5_PREHEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END17]], label [[FOR_COND5]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.end17: ; CHECK-NEXT: ret void ; entry: - %i = load i32, i32* @k, align 4 - %cmp32 = icmp slt i32 %i, %q - br i1 %cmp32, label %for.body.preheader, label %for.cond2.preheader - -for.body.preheader: ; preds = %entry - br label %for.body - -for.cond2.preheader.loopexit: ; preds = %for.body - br label %for.cond2.preheader - -for.cond2.preheader: ; preds = %for.cond2.preheader.loopexit, %entry %arrayidx9.1 = getelementptr inbounds i32, i32* %p2, i64 1 %arrayidx9.2 = getelementptr inbounds i32, i32* %p2, i64 2 - br label %for.cond5.preheader - -for.body: ; preds = %for.body, %for.body.preheader - %i1 = phi i32 [ %inc, %for.body ], [ %i, %for.body.preheader ] - %idxprom = sext i32 %i1 to i64 - %arrayidx = getelementptr inbounds i32, i32* %p2, i64 %idxprom - store i32 2, i32* %arrayidx, align 4 - %i2 = load i32, i32* @k, align 4 - %inc = add nsw i32 %i2, 1 - store i32 %inc, i32* @k, align 4 - %cmp = icmp slt i32 %inc, %q - br i1 %cmp, label %for.body, label %for.cond2.preheader.loopexit + br label %for.cond5 -for.cond5.preheader: ; preds = %for.cond5.preheader, %for.cond2.preheader - %indvars.iv = phi i64 [ 0, %for.cond2.preheader ], [ %indvars.iv.next, %for.cond5.preheader ] +for.cond5: ; preds = %entry, %for.cond5 + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond5 ] %i3 = load i32, i32* %p2, align 4 %sub = sub nsw i32 0, %i3 %i4 = load i32, i32* %arrayidx9.1, align 4 @@ -125,8 +85,8 @@ store i32 %sub.2, i32* %arrayidx14, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 63 - br i1 %exitcond, label %for.end17, label %for.cond5.preheader + br i1 %exitcond, label %for.end17, label %for.cond5 -for.end17: ; preds = %for.cond5.preheader +for.end17: ; preds = %for.cond5 ret void } Index: llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll +++ llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll @@ -21,8 +21,8 @@ ; CHECK-NEXT: br i1 [[C_2]], label [[OUTER_LATCH:%.*]], label [[INNER_BB:%.*]] ; CHECK: inner.bb: ; CHECK-NEXT: [[C_3:%.*]] = call i1 @cond() -; CHECK-NEXT: br i1 [[C_3]], label [[LOOP3_PREHEADER:%.*]], label [[INNER_LATCH:%.*]] -; CHECK: loop3.preheader: +; CHECK-NEXT: br i1 [[C_3]], label [[LOOP_3_PREHEADER:%.*]], label [[INNER_LATCH:%.*]] +; CHECK: loop.3.preheader: ; CHECK-NEXT: [[L_1_LCSSA8:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ] ; CHECK-NEXT: [[L_1_LCSSA:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ] ; CHECK-NEXT: [[L_2_LCSSA:%.*]] = phi i16* [ [[L_2]], [[INNER_BB]] ] @@ -61,27 +61,27 @@ ; CHECK-NEXT: store i16 [[TMP9]], i16* [[TMP7]], align 2, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP3_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[LOOP3:%.*]]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[LOOP_3:%.*]]
; CHECK: inner.latch:
; CHECK-NEXT: [[C_4:%.*]] = call i1 @cond()
; CHECK-NEXT: br i1 [[C_4]], label [[EXIT_LOOPEXIT1:%.*]], label [[INNER]]
; CHECK: outer.latch:
; CHECK-NEXT: br label [[OUTER_BACKEDGE]]
-; CHECK: loop3:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK: loop.3:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[C_5:%.*]] = icmp ult i64 [[IV]], [[N]]
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[L_1_LCSSA]], i64 [[IV_NEXT]]
; CHECK-NEXT: [[LOOP_L_1:%.*]] = load i16, i16* [[GEP_1]], align 2
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i16, i16* [[L_2_LCSSA]], i64 0
; CHECK-NEXT: store i16 [[LOOP_L_1]], i16* [[GEP_2]], align 2
-; CHECK-NEXT: br i1 [[C_5]], label [[LOOP3]], label [[EXIT_LOOPEXIT]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[C_5]], label [[LOOP_3]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit.loopexit:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: exit.loopexit1:
@@ -107,7 +107,7 @@
inner.bb: ; preds = %bb3
%c.3 = call i1 @cond()
- br i1 %c.3, label %loop3, label %inner.latch
+ br i1 %c.3, label %loop.3, label %inner.latch
inner.latch: ; preds = %bb4
%c.4 = call i1 @cond()
@@ -116,15 +116,15 @@
outer.latch: ; preds = %bb3
br label %outer
-loop3: ; preds = %bb9, %bb4
- %iv = phi i64 [ %iv.next, %loop3 ], [ 0, %inner.bb ]
+loop.3: ; preds = %loop.3, %inner.bb
+ %iv = phi i64 [ %iv.next, %loop.3 ], [ 0, %inner.bb ]
%iv.next = add nsw nuw i64 %iv, 1
%c.5 = icmp ult i64 %iv, %N
%gep.1 = getelementptr inbounds i16, i16* %l.1, i64 %iv.next
%loop.l.1 = load i16, i16* %gep.1, align 2
%gep.2 = getelementptr inbounds i16, i16* %l.2, i64 0
store i16 %loop.l.1, i16* %gep.2 , align 2
- br i1 %c.5, label %loop3, label %exit
+ br i1 %c.5, label %loop.3, label %exit
exit: ; preds = %bb15, %bb5
%l.3 = load i16, i16* %l.1, align 2