Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -0,0 +1,2229 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=aarch64-none-linux-gnu -S -passes=loop-vectorize,instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -mattr=+sve -scalable-vectorization=on -runtime-memory-check-threshold=24 < %s | FileCheck %s + +; At the moment LLVM is not capable of vectorizing interleaved accesses operating +; on scalable vectors. This test checks whether the LV decides to use +; gather/scatter for such cases. + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Check vectorization on an interleaved load group of factor 2 and an interleaved +; store group of factor 2. + +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) #1 { +; CHECK-LABEL: @test_array_load2_store2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, 
zeroinitializer) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, [[TMP7]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = mul nsw [[WIDE_MASKED_GATHER1]], [[BROADCAST_SPLAT3]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, [[VEC_IND]] +; CHECK-NEXT: call 
void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP9]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, [[TMP7]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP10]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], [[C]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[D]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: 
store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP1]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx0, align 4 + %tmp1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1 + %tmp2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %tmp, %C + %mul = mul nsw i32 %tmp2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on an interleaved load group of factor 2 with narrower types and an interleaved +; store group of factor 2. 
+ +; short AB[1024]; +; int CD[1024]; +; void test_array_load2_i16_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; short A = AB[i]; +; short B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + + +@AB_i16 = common global [1024 x i16] zeroinitializer, align 4 + +define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { +; CHECK-LABEL: @test_array_load2_i16_store2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] 
= getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( [[TMP6]], i32 2, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, [[TMP7]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( [[TMP8]], i32 2, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = sext [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[BROADCAST_SPLAT]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP10]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP12:%.*]] = sext [[WIDE_MASKED_GATHER1]] to +; CHECK-NEXT: [[TMP13:%.*]] = mul nsw [[BROADCAST_SPLAT3]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, [[TMP7]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP13]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; 
CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, i64 [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = load i16, i16* [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[CONV]], [[C]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[CONV6:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV6]], [[D]] +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP19]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, i64 %indvars.iv + %0 = load i16, i16* 
%arrayidx, align 2 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [1024 x i16], [1024 x i16]* @AB_i16, i64 0, i64 %1 + %2 = load i16, i16* %arrayidx2, align 2 + %conv = sext i16 %0 to i32 + %add3 = add nsw i32 %conv, %C + %arrayidx5 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add3, i32* %arrayidx5, align 4 + %conv6 = sext i16 %2 to i32 + %mul = mul nsw i32 %conv6, %D + %arrayidx9 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv, 1022 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on an interleaved load group of factor 2 and an interleaved +; store group of factor 2 with narrower types. + +; int AB[1024]; +; short CD[1024]; +; void test_array_load2_store2_i16(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; short A = AB[i]; +; short B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + + +@CD_i16 = dso_local local_unnamed_addr global [1024 x i16] zeroinitializer, align 2 + +define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { +; CHECK-LABEL: @test_array_load2_store2_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; 
CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, [[TMP7]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc [[TMP9]] to +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, i64 0, [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( [[TMP10]], [[TMP11]], i32 2, shufflevector ( insertelement ( 
poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP12:%.*]] = mul nsw [[WIDE_MASKED_GATHER1]], [[BROADCAST_SPLAT3]] +; CHECK-NEXT: [[TMP13:%.*]] = trunc [[TMP12]] to +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, i64 0, [[TMP7]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( [[TMP13]], [[TMP14]], i32 2, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP18]], [[C]] +; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[ADD3]] to i16 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, 
i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store i16 [[CONV]], i16* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP20]], [[D]] +; CHECK-NEXT: [[CONV6:%.*]] = trunc i32 [[MUL]] to i16 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, i64 0, i64 [[TMP19]] +; CHECK-NEXT: store i16 [[CONV6]], i16* [[ARRAYIDX9]], align 2 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %0, %C + %conv = trunc i32 %add3 to i16 + %arrayidx5 = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, i64 0, i64 %indvars.iv + store i16 %conv, i16* %arrayidx5, align 2 + %mul = mul nsw i32 %2, %D + %conv6 = trunc i32 %mul to i16 + %arrayidx9 = getelementptr inbounds [1024 x i16], [1024 x i16]* @CD_i16, i64 0, i64 %1 + store i16 %conv6, i16* %arrayidx9, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv, 1022 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} +; int A[3072]; +; struct ST3 S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; S[i].x = X1 + 1; +; S[i].y = X2 + 2; +; S[i].z = X3 + 3; +; } +; } + +%struct.ST3 = type { i32, i32, i32 } +@A = common global [3072 x i32] 
zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() #1 { +; CHECK-LABEL: @test_struct_array_load3_store3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[N_VEC]], 3 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr [3072 x i32], [3072 x i32]* @A, i64 0, i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul nuw nsw i64 [[TMP6]], 12 +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i64 3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], [[VECTOR_GEP]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, 
[[TMP9]], i64 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, [[TMP9]], i64 2 +; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, [[VEC_IND]], i32 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP12]], [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = add nsw [[WIDE_MASKED_GATHER4]], shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, [[VEC_IND]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP14]], [[TMP15]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = add nsw [[WIDE_MASKED_GATHER5]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, [[VEC_IND]], i32 2 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP16]], [[TMP17]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 
0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[PTR_016:%.*]] = phi i32* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR2:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[PTR_016]], i64 1 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR_016]], align 4 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[PTR_016]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[INCDEC_PTR2]] = getelementptr inbounds i32, i32* [[PTR_016]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], 1 +; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: store i32 [[ADD]], i32* [[X]], align 4 
+; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 2 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: store i32 [[ADD3]], i32* [[Y]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 3 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDVARS_IV]], i32 2 +; CHECK-NEXT: store i32 [[ADD6]], i32* [[Z]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %tmp = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %tmp1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %tmp2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %tmp, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add nsw i32 %tmp1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %tmp2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 
1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on an interleaved load group of factor 4. + +; struct ST4{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST4 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) #1 { +; CHECK-LABEL: @test_struct_load4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP5]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; 
CHECK-NEXT: [[TMP6:%.*]] = add [[WIDE_MASKED_GATHER]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[S]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP7]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[S]], [[VEC_IND]], i32 2 +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[S]], [[VEC_IND]], i32 3 +; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP6]], [[WIDE_MASKED_GATHER2]] +; CHECK-NEXT: [[TMP11:%.*]] = add [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER3]] +; CHECK-NEXT: [[TMP12]] = sub [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP12]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: 
[[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_022:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB8:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[S]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[X]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], [[R_022]] +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[S]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[Y]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[S]], i64 [[INDVARS_IV]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[Z]], align 4 +; CHECK-NEXT: [[W:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[S]], i64 [[INDVARS_IV]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[W]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[ADD]], [[TMP2]] +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[SUB8]] = sub i32 [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[SUB8_LCSSA:%.*]] = phi i32 [ [[SUB8]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[SUB8_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %tmp = load i32, 
i32* %x, align 4 + %add = add nsw i32 %tmp, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %tmp1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %tmp1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %tmp2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %tmp2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %tmp3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %tmp3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + + +; Check vectorization on an interleaved load group of factor 6. +; There is no dedicated ldN/stN so use gather instead + +%struct.ST6 = type { i32, i32, i32, i32, i32, i32 } + +define i32 @test_struct_load6(%struct.ST6* %S) #1 { +; CHECK-LABEL: @test_struct_load6( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: 
[[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], %struct.ST6* [[S:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP5]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], [[VEC_IND]], i32 2 +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP7]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], [[VEC_IND]], i32 3 +; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], [[VEC_IND]], i32 4 +; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], [[VEC_IND]], i32 5 +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP11:%.*]] = add [[WIDE_MASKED_GATHER]], 
[[VEC_PHI]] +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP11]], [[WIDE_MASKED_GATHER2]] +; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER3]] +; CHECK-NEXT: [[TMP14:%.*]] = add [[TMP13]], [[WIDE_MASKED_GATHER4]] +; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP14]], [[WIDE_MASKED_GATHER5]] +; CHECK-NEXT: [[TMP16]] = sub [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_041:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB14:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[X]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[Y]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 
[[INDVARS_IV]], i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[Z]], align 4 +; CHECK-NEXT: [[W:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 [[INDVARS_IV]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[W]], align 4 +; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 [[INDVARS_IV]], i32 4 +; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST6]], %struct.ST6* [[S]], i64 [[INDVARS_IV]], i32 5 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[B]], align 4 +; CHECK-NEXT: [[DOTNEG36:%.*]] = add i32 [[TMP21]], [[R_041]] +; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[DOTNEG36]], [[TMP23]] +; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP22]], [[TMP24]] +; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP28]], [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP29]], [[TMP26]] +; CHECK-NEXT: [[SUB14]] = sub i32 [[TMP27]], [[TMP30]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[SUB14_LCSSA:%.*]] = phi i32 [ [[SUB14]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[SUB14_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.041 = phi i32 [ 0, %entry ], [ %sub14, %for.body ] + %x = getelementptr inbounds %struct.ST6, %struct.ST6* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST6, %struct.ST6* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %z = getelementptr inbounds %struct.ST6, %struct.ST6* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %w = getelementptr inbounds %struct.ST6, 
%struct.ST6* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %a = getelementptr inbounds %struct.ST6, %struct.ST6* %S, i64 %indvars.iv, i32 4 + %4 = load i32, i32* %a, align 4 + %b = getelementptr inbounds %struct.ST6, %struct.ST6* %S, i64 %indvars.iv, i32 5 + %5 = load i32, i32* %b, align 4 + %.neg36 = add i32 %0, %r.041 + %6 = add i32 %.neg36, %2 + %7 = add i32 %1, %3 + %8 = add i32 %7, %4 + %9 = add i32 %8, %5 + %sub14 = sub i32 %6, %9 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body + %sub14.lcssa = phi i32 [ %sub14, %for.body ] + ret i32 %sub14.lcssa +} + +; Check vectorization on an interleaved store group of factor 4. + +; void test_struct_store4(int *A, struct ST4 *B) { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; B[i].x = X + 1; +; B[i].y = X * 2; +; B[i].z = X + 3; +; B[i].w = X + 4; +; } +; } + + +define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) #1 { +; CHECK-LABEL: @test_struct_store4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br 
label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[NEXT_GEP]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[B:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP6]], [[TMP7]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP8:%.*]] = shl nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], [[VEC_IND]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP8]], [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], [[VEC_IND]], i32 2 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP10]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 4, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], [[VEC_IND]], i32 3 +; CHECK-NEXT: call void 
@llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP12]], [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[PTR_024:%.*]] = phi i32* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[PTR_024]], i64 1 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR_024]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], 1 +; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: store i32 [[ADD]], i32* [[X]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: store i32 [[MUL]], i32* [[Y]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP]], 3 +; CHECK-NEXT: [[Z:%.*]] = 
getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 2 +; CHECK-NEXT: store i32 [[ADD3]], i32* [[Z]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP]], 4 +; CHECK-NEXT: [[W:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 3 +; CHECK-NEXT: store i32 [[ADD6]], i32* [[W]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1 + %tmp = load i32, i32* %ptr.024, align 4 + %add = add nsw i32 %tmp, 1 + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %tmp, 1 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %add3 = add nsw i32 %tmp, 3 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2 + store i32 %add3, i32* %z, align 4 + %add6 = add nsw i32 %tmp, 4 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3 + store i32 %add6, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Check vectorization on a reverse interleaved load group of factor 2 and +; a reverse interleaved store group of factor 2. 
+ +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + + +%struct.ST2 = type { i32, i32 } + +define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) #1 { +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add nsw i64 [[N_MOD_VF]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i64 1023, i32 0), poison, zeroinitializer), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP3]], -4 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[DOTNEG]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION1:%.*]] = sub shufflevector ( insertelement ( poison, i32 1023, i32 0), poison, zeroinitializer), [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG7:%.*]] = mul nsw i32 [[TMP5]], -4 +; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement poison, i32 [[DOTNEG7]], i64 0 +; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector [[DOTSPLATINSERT2]], poison, zeroinitializer +; 
CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND4:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], %struct.ST2* [[A:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[VEC_IND4]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[A]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = sub nsw [[WIDE_MASKED_GATHER6]], [[VEC_IND4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP7]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B]], [[VEC_IND]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP9]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: 
[[VEC_IND_NEXT5]] = add [[VEC_IND4]], [[DOTSPLAT3]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[A]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[X]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP]], [[TMP1]] +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[A]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[Y]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: store i32 [[ADD]], i32* [[X5]], align 4 +; CHECK-NEXT: [[Y8:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: store i32 [[SUB]], i32* [[Y8]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[INDVARS_IV]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP17:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv 
= phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %tmp = load i32, i32* %x, align 4 + %tmp1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %tmp, %tmp1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %tmp2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %tmp2, %tmp1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on an interleaved load group of factor 2 with 1 gap +; (missing the load of odd elements). Because the vectorized loop would +; speculatively access memory out-of-bounds, we must execute at least one +; iteration of the scalar loop. 
+ +; void even_load_static_tc(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; B[i/2] = A[i] * 2; +; } + + +define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) #1 { +; CHECK-LABEL: @even_load_static_tc( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[TMP3]] +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl [[TMP4]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = and i64 [[INDEX]], 9223372036854775804 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = shl nsw [[WIDE_MASKED_GATHER]], 
shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to * +; CHECK-NEXT: store [[TMP9]], * [[TMP11]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP19:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, 
%for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %tmp, 1 + %tmp1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on an interleaved load group of factor 2 with 1 gap +; (missing the load of odd elements). Because the vectorized loop would +; speculatively access memory out-of-bounds, we must execute at least one +; iteration of the scalar loop. + +; void even_load_dynamic_tc(int *A, int *B, unsigned N) { +; for (unsigned i = 0; i < N; i+=2) +; B[i/2] = A[i] * 2; +; } + + +define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) #1 { +; CHECK-LABEL: @even_load_dynamic_tc( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2) +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT_NOT:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT_NOT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP4:%.*]] = add nuw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i64 [[TMP6]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP8]] +; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 +; 
CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP10:%.*]] = shl [[TMP9]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP10]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = and i64 [[INDEX]], 9223372036854775804 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP14:%.*]] = shl nsw [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to * +; CHECK-NEXT: store [[TMP14]], * [[TMP16]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP21:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %tmp, 1 + %tmp1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, %N + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on a reverse interleaved load group of factor 2 with 1 +; gap and a reverse interleaved store group of factor 2. The interleaved load +; group should be removed since it has a gap and is reverse. 
+ +; struct pair { +; int x; +; int y; +; }; +; +; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) { +; for (int i = 1023; i >= 0; i--) { +; int a = X + i; +; int b = P2[i].y - i; +; P1[i].x = a; +; P2[i].y = b; +; } +; } + +;TODO: still generates gather/scatter; it looks like instead of a scatter we could have an st2 +%pair = type { i64, i64 } +define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) #1 { +; CHECK-LABEL: @load_gap_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add nsw i64 [[N_MOD_VF]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i64 1023, i32 0), poison, zeroinitializer), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP3]], -4 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[DOTNEG]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[X:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add nsw [[BROADCAST_SPLAT]], [[VEC_IND]] +; CHECK-NEXT:
[[TMP5:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P1:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2:%.*]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i64.nxv4p0i64( [[TMP6]], i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw [[WIDE_MASKED_GATHER]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0i64( [[TMP4]], [[TMP5]], i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0i64( [[TMP7]], [[TMP6]], i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_NEXT]] = add nsw i64 [[I]], -1 +; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[I]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK: for.exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ 
1023, %entry ], [ %i.next, %for.body ] + %0 = add nsw i64 %X, %i + %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0 + %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1 + %3 = load i64, i64* %2, align 8 + %4 = sub nsw i64 %3, %i + store i64 %0, i64* %1, align 8 + store i64 %4, i64* %2, align 8 + %i.next = add nsw i64 %i, -1 + %cond = icmp sgt i64 %i, 0 + br i1 %cond, label %for.body, label %for.exit + +for.exit: + ret void +} + +; Check vectorization on interleaved access groups identified from mixed +; loads/stores. +; void mixed_load2_store2(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) { +; B[i] = A[i] * A[i+1]; +; B[i+1] = A[i] + A[i+1]; +; } +; } + + +define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) #1 { +; CHECK-LABEL: @mixed_load2_store2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ 
[[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = or [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], [[TMP7]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = mul nsw [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP9]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP11:%.*]] = add nsw [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER2]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], [[TMP7]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP11]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] +; 
CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP25:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, 
%entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %tmp = load i32, i32* %arrayidx, align 4 + %tmp1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1 + %tmp2 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %tmp2, %tmp + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %tmp3 = load i32, i32* %arrayidx, align 4 + %tmp4 = load i32, i32* %arrayidx2, align 4 + %add10 = add nsw i32 %tmp4, %tmp3 + %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1 + store i32 %add10, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on interleaved access groups identified from mixed +; loads/stores. +; void mixed_load3_store3(int *A) { +; for (unsigned i = 0; i < 1024; i++) { +; *A++ += i; +; *A++ += i; +; *A++ += i; +; } +; } + +define void @mixed_load3_store3(i32* nocapture %A) #1 { +; CHECK-LABEL: @mixed_load3_store3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[N_VEC]], 3 +; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 
[[TMP5]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul nuw nsw i64 [[TMP6]], 12 +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i64 3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], [[VECTOR_GEP]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, [[TMP9]], i64 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP11:%.*]] = add [[WIDE_MASKED_GATHER]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP11]], [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, [[TMP9]], i64 2 +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_MASKED_GATHER5]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP13]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call 
@llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP14:%.*]] = add [[WIDE_MASKED_GATHER6]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[TMP14]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32* [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A_ADDR_012:%.*]] = phi i32* [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INCDEC_PTR3:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[A_ADDR_012]], i64 1 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[A_ADDR_012]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[I_013]] +; CHECK-NEXT: store i32 [[ADD]], i32* [[A_ADDR_012]], align 4 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* 
[[A_ADDR_012]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP1]], [[I_013]] +; CHECK-NEXT: store i32 [[ADD2]], i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i32, i32* [[A_ADDR_012]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[TMP2]], [[I_013]] +; CHECK-NEXT: store i32 [[ADD4]], i32* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_013]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1 + %tmp = load i32, i32* %A.addr.012, align 4 + %add = add i32 %tmp, %i.013 + store i32 %add, i32* %A.addr.012, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2 + %tmp1 = load i32, i32* %incdec.ptr, align 4 + %add2 = add i32 %tmp1, %i.013 + store i32 %add2, i32* %incdec.ptr, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3 + %tmp2 = load i32, i32* %incdec.ptr1, align 4 + %add4 = add i32 %tmp2, %i.013 + store i32 %add4, i32* %incdec.ptr1, align 4 + %inc = add nuw nsw i32 %i.013, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Check vectorization on interleaved access groups with members having different +; kinds of type. 
+ +; struct IntFloat { +; int a; +; float b; +; }; +; +; int SA; +; float SB; +; +; void int_float_struct(struct IntFloat *A) { +; int SumA; +; float SumB; +; for (unsigned i = 0; i < 1024; i++) { +; SumA += A[i].a; +; SumB += A[i].b; +; } +; SA = SumA; +; SB = SumB; +; } + + +%struct.IntFloat = type { i32, float } + +@SA = common global i32 0, align 4 +@SB = common global float 0.000000e+00, align 4 + +define void @int_float_struct(%struct.IntFloat* nocapture readonly %p) #0 { +; CHECK-LABEL: @int_float_struct( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ insertelement ( zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], %struct.IntFloat* [[P:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call 
@llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP5]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP6]] = add [[WIDE_MASKED_GATHER]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT]], %struct.IntFloat* [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0f32( [[TMP7]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP8]] = fadd fast [[VEC_PHI]], [[WIDE_MASKED_GATHER2]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP6]]) +; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX3:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] 
] +; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* @SA, align 4 +; CHECK-NEXT: store float [[ADD3_LCSSA]], float* @SB, align 4 +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUMB_014:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUMA_013:%.*]] = phi i32 [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT]], %struct.IntFloat* [[P]], i64 [[INDVARS_IV]], i32 0 +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP]], [[SUMA_013]] +; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT]], %struct.IntFloat* [[P]], i64 [[INDVARS_IV]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[B]], align 4 +; CHECK-NEXT: [[ADD3]] = fadd fast float [[SUMB_014]], [[TMP1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + store i32 %add, i32* @SA, align 4 + store float %add3, float* @SB, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %p, i64 %indvars.iv, i32 0 + %tmp = load i32, i32* %a, align 4 + %add = add nsw i32 %tmp, %SumA.013 + %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %p, i64 %indvars.iv, i32 1 + %tmp1 = load float, float* %b, align 4 + %add3 = fadd fast float %SumB.014, %tmp1 + 
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Check vectorization of interleaved access groups in the presence of +; dependences (PR27626). The following tests check that we don't reorder +; dependent loads and stores when generating code for interleaved access +; groups. Stores should be scalarized because the required code motion would +; break dependences, and the remaining interleaved load groups should have +; gaps. + +; PR27626_0: Ensure a strided store is not moved after a dependent (zero +; distance) strided load. + +; void PR27626_0(struct pair *p, int z, int n) { +; for (int i = 0; i < n; i++) { +; p[i].x = z; +; p[i].y = p[i].x; +; } +; } + + +%pair.i32 = type { i32, i32 } +;TODO: SVE uses a masked scatter for the p[i].y store, whereas for NEON the store is scalarised, +; which is what this test is actually meant to check +define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) #1 { +; CHECK-LABEL: @PR27626_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64
[[TMP7]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = 
phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 +; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_X]], align 4 +; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_Y]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 + %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 + store i32 %z, i32* %p_i.x, align 4 + %0 = load i32, i32* %p_i.x, align 4 + store i32 %0, i32 *%p_i.y, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; PR27626_1: Ensure a strided load is not moved before a dependent (zero +; distance) strided store. 
+ +; void PR27626_1(struct pair *p, int n) { +; int s = 0; +; for (int i = 0; i < n; i++) { +; p[i].y = p[i].x; +; s += p[i].y; +; } +; } + + +;TODO: SVE uses a masked scatter for the p[i].y store, whereas for NEON the store is scalarised, +; which is what this test is actually meant to check +define i32 @PR27626_1(%pair.i32 *%p, i64 %n) #1 { +; CHECK-LABEL: @PR27626_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], [[VEC_IND]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP11]] = add [[WIDE_MASKED_GATHER1]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP11]]) +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP17:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 +; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], 
%pair.i32* [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[P_I_X]], align 4 +; CHECK-NEXT: store i32 [[TMP16]], i32* [[P_I_Y]], align 4 +; CHECK-NEXT: [[TMP17]] = add nsw i32 [[TMP16]], [[S]] +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 [[TMP17]] +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %s = phi i32 [ %2, %for.body ], [ 0, %entry ] + %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 + %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 + %0 = load i32, i32* %p_i.x, align 4 + store i32 %0, i32* %p_i.y, align 4 + %1 = load i32, i32* %p_i.y, align 4 + %2 = add nsw i32 %1, %s + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %3 = phi i32 [ %2, %for.body ] + ret i32 %3 +} + +; PR27626_2: Ensure a strided store is not moved after a dependent (negative +; distance) strided load. 
+ +; void PR27626_2(struct pair *p, int z, int n) { +; for (int i = 0; i < n; i++) { +; p[i].x = z; +; p[i].y = p[i - 1].x; +; } +; } + + +;TODO: SVE uses masked scatters for the stores to p[i].x and p[i].y, whereas for NEON we have a scalarised store, +; which is what this test is actually checking +; NOTE(review): %i_minus_1 is computed with 'add nuw nsw i64 %i, -1', which is poison for every %i other than 0, +; so the p[i - 1].x address is folded to the constant index -1 in the assertions below. Dropping 'nuw' and +; regenerating the assertions looks necessary to exercise the intended dependence - confirm. +define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) #1 { +; CHECK-LABEL: @PR27626_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT:
[[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], shufflevector ( insertelement ( poison, i64 -1, i32 0), poison, zeroinitializer), i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 +; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0 +; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds 
[[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_X]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[P_I_MINUS_1_X]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], i32* [[P_I_Y]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP35:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %i_minus_1 = add nuw nsw i64 %i, -1 + %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 + %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0 + %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 + store i32 %z, i32* %p_i.x, align 4 + %0 = load i32, i32* %p_i_minus_1.x, align 4 + store i32 %0, i32 *%p_i.y, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; PR27626_3: Ensure a strided load is not moved before a dependent (negative +; distance) strided store. 
+ +; void PR27626_3(struct pair *p, int z, int n) { +; for (int i = 0; i < n; i++) { +; p[i + 1].y = p[i].x; +; s += p[i].y; +; } +; } + + +;TODO: uses sve masked scatter for p[i+1].y store for neon we have scalarised store +; what is actually what this test is checking +define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) #1 { +; CHECK-LABEL: @PR27626_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 
0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], [[TMP9]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[WIDE_MASKED_GATHER]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0i32( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP13]] = add [[WIDE_MASKED_GATHER1]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[S:%.*]] = phi i32 [ 
[[TMP20:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 +; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 +; CHECK-NEXT: [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I_PLUS_1]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[P_I_X]], align 4 +; CHECK-NEXT: store i32 [[TMP18]], i32* [[P_I_PLUS_1_Y]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[P_I_Y]], align 4 +; CHECK-NEXT: [[TMP20]] = add nsw i32 [[TMP19]], [[S]] +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 [[TMP20]] +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %s = phi i32 [ %2, %for.body ], [ 0, %entry ] + %i_plus_1 = add nuw nsw i64 %i, 1 + %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 + %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 + %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1 + %0 = load i32, i32* %p_i.x, align 4 + store i32 %0, i32* %p_i_plus_1.y, align 4 + %1 = load i32, i32* %p_i.y, align 4 + %2 = add nsw i32 %1, %s + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %3 = phi i32 [ %2, %for.body ] + ret i32 %3 +} + +; PR27626_4: Ensure we form an interleaved group for strided stores in the +; presence of a write-after-write dependence. We create a group for +; (2) and (3) while excluding (1). 
+ +; void PR27626_4(int *a, int x, int y, int z, int n) { +; for (int i = 0; i < n; i += 2) { +; a[i] = x; // (1) +; a[i] = y; // (2) +; a[i + 1] = z; // (3) +; } +; } + +;TODO: uses sve masked scatter, but for neon we have a scalarised store for a[i] = x, which is fine +define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { +; CHECK-LABEL: @PR27626_4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2) +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl [[TMP7]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]],
i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP8]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = add [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], [[TMP11]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT2]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT4]], [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label 
[[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_PLUS_1:%.*]] = or i64 [[I]], 1 +; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] +; CHECK-NEXT: [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_PLUS_1]] +; CHECK-NEXT: store i32 [[Y]], i32* [[A_I]], align 4 +; CHECK-NEXT: store i32 [[Z]], i32* [[A_I_PLUS_1]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP39:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %i_plus_1 = add i64 %i, 1 + %a_i = getelementptr inbounds i32, i32* %a, i64 %i + %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1 + store i32 %x, i32* %a_i, align 4 + store i32 %y, i32* %a_i, align 4 + store i32 %z, i32* %a_i_plus_1, align 4 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; PR27626_5: Ensure we do not form an interleaved group for strided stores in +; the presence of a write-after-write dependence. 
+ +; void PR27626_5(int *a, int x, int y, int z, int n) { +; for (int i = 3; i < n; i += 2) { +; a[i - 1] = x; +; a[i - 3] = y; +; a[i] = z; +; } +; } + + +;TODO: uses masked scatter, but this is a test which checks if interleaving is not used +define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { +; CHECK-LABEL: @PR27626_5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5) +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP7]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl [[TMP8]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add [[TMP9]], shufflevector ( insertelement ( poison, i64 3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X:%.*]], i64 0 +; CHECK-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = add [[VEC_IND]], shufflevector ( insertelement ( poison, i64 -1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = add [[VEC_IND]], shufflevector ( insertelement ( poison, i64 -3, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], [[TMP13]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT]], [[TMP15]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT2]], [[TMP16]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( [[BROADCAST_SPLAT4]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw 
i64 [[INDEX]], [[TMP18]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_MINUS_1:%.*]] = add i64 [[I]], -1 +; CHECK-NEXT: [[I_MINUS_3:%.*]] = add i64 [[I]], -3 +; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]] +; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_1]] +; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_3]] +; CHECK-NEXT: store i32 [[X]], i32* [[A_I_MINUS_1]], align 4 +; CHECK-NEXT: store i32 [[Y]], i32* [[A_I_MINUS_3]], align 4 +; CHECK-NEXT: store i32 [[Z]], i32* [[A_I]], align 4 +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ] + %i_minus_1 = sub i64 %i, 1 + %i_minus_3 = sub i64 %i_minus_1, 2 + %a_i = getelementptr inbounds i32, i32* %a, i64 %i + %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1 + %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3 + store i32 %x, i32* %a_i_minus_1, align 4 + store i32 %y, i32* %a_i_minus_3, align 4 + store i32 %z, i32* %a_i, 
align 4 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +; PR34743: Ensure that a cast which needs to sink after a load that belongs to +; an interleaved group, indeed gets sunk. + +; void PR34743(short *a, int *b, int n) { +; for (int i = 0, iv = 0; iv < n; i++, iv += 2) { +; b[i] = a[iv] * a[iv+1] * a[iv+2]; +; } +; } + + +define void @PR34743(i16* %a, i32* %b, i64 %n) #1 { +; CHECK-LABEL: @PR34743( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[A:%.*]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[N]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw i64 [[TMP4]], 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i16, i16* [[A]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[N]], -2 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 3 +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i16, i16* [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[SCEVGEP5]] to i32* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[TMP8]], [[B]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[SCEVGEP]] to i16* +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i16* [[SCEVGEP3]], [[TMP9]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +;
CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP11]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i32 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = add nsw i32 [[TMP13]], -1 +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i16 [[DOTPRE]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl [[TMP15]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP18]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP16]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 2, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[A]], [[TMP19]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( [[TMP21]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison), !alias.scope !42 +; CHECK-NEXT: [[TMP22:%.*]] = sext [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: 
[[TMP23:%.*]] = getelementptr inbounds i16, i16* [[A]], [[TMP20]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER8]] = call @llvm.masked.gather.nxv4i16.nxv4p0i16( [[TMP23]], i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), poison), !alias.scope !42 +; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.experimental.vector.splice.nxv4i16( [[VECTOR_RECUR]], [[WIDE_MASKED_GATHER8]], i32 -1) +; CHECK-NEXT: [[TMP25:%.*]] = sext [[TMP24]] to +; CHECK-NEXT: [[TMP26:%.*]] = sext [[WIDE_MASKED_GATHER8]] to +; CHECK-NEXT: [[TMP27:%.*]] = mul nsw [[TMP25]], [[TMP22]] +; CHECK-NEXT: [[TMP28:%.*]] = mul nsw [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to * +; CHECK-NEXT: store [[TMP28]], * [[TMP30]], align 4, !alias.scope !45, !noalias !42 +; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP32:%.*]] = shl nuw nsw i64 [[TMP31]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP32]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP35:%.*]] = shl nuw nsw i32 [[TMP34]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = add nsw i32 [[TMP35]], -1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_MASKED_GATHER8]], i32 [[TMP36]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, 
[[ENTRY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32 +; CHECK-NEXT: [[I1]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[IV1:%.*]] = or i64 [[IV]], 1 +; CHECK-NEXT: [[IV2]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV1]] +; CHECK-NEXT: [[LOAD1:%.*]] = load i16, i16* [[GEP1]], align 4 +; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV2]] +; CHECK-NEXT: [[LOAD2]] = load i16, i16* [[GEP2]], align 4 +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32 +; CHECK-NEXT: [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]] +; CHECK-NEXT: [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: store i32 [[MUL012]], i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP48:![0-9]+]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + %.pre = load i16, i16* %a + br label %loop + +loop: + %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ] + %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ] + %i = phi i64 [ 0, %entry ], [ %i1, %loop ] + %conv = sext i16 %0 to i32 + %i1 = add nuw nsw i64 %i, 1 + %iv1 = add nuw nsw i64 %iv, 1 + %iv2 = add nuw nsw i64 %iv, 2 + %gep1 = getelementptr inbounds i16, i16* 
%a, i64 %iv1 + %load1 = load i16, i16* %gep1, align 4 + %conv1 = sext i16 %load1 to i32 + %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2 + %load2 = load i16, i16* %gep2, align 4 + %conv2 = sext i16 %load2 to i32 + %mul01 = mul nsw i32 %conv, %conv1 + %mul012 = mul nsw i32 %mul01, %conv2 + %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i + store i32 %mul012, i32* %arrayidx5 + %exitcond = icmp eq i64 %iv, %n + br i1 %exitcond, label %end, label %loop + +end: + ret void +} + +attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } +attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) }