Index: lib/Transforms/InstCombine/InstCombineCasts.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCasts.cpp +++ lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1719,7 +1719,7 @@ // cast to be exposed to other transforms. unsigned AS = CI.getAddressSpace(); if (CI.getOperand(0)->getType()->getScalarSizeInBits() != - DL.getPointerSizeInBits(AS)) { + DL.getIndexSizeInBits(AS)) { Type *Ty = DL.getIntPtrType(CI.getContext(), AS); if (CI.getType()->isVectorTy()) // Handle vectors of pointers. Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); Index: test/Transforms/InstCombine/ptr-int-cast_custom_gep.ll =================================================================== --- /dev/null +++ test/Transforms/InstCombine/ptr-int-cast_custom_gep.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s +target datalayout = "E-p:40:64:64:32-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" + +define i1 @test1(i32 *%x) nounwind { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint i32* [[X:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[TMP2]] +; +entry: + %0 = ptrtoint i32* %x to i1 + ret i1 %0 +} + +define i32* @test2(i128 %x) nounwind { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i128 [[X:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to i32* +; CHECK-NEXT: ret i32* [[TMP1]] +; +entry: + %0 = inttoptr i128 %x to i32* + ret i32* %0 +} + +; PR3574 +define i64 @f0(i32 %a0) nounwind { +; CHECK-LABEL: @f0( +; CHECK-NEXT: [[T1:%.*]] = zext i32 [[A0:%.*]] to i64 +; CHECK-NEXT: ret i64 [[T1]] +; + %t0 = inttoptr i32 %a0 to i8* + %t1 = ptrtoint i8* %t0 to i64 + ret i64 %t1 +} + +define <4 x i32> @test4(<4 x i8*> %arg) nounwind { +; CHECK-LABEL: @test4( +; CHECK-NEXT: [[P1:%.*]] = ptrtoint <4 x i8*> [[ARG:%.*]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[P1]] +; + %p1 = ptrtoint <4 x i8*> %arg to <4 x i32> + ret <4 x i32> %p1 +} + +define <4 x i128> @test5(<4 x i8*> %arg) nounwind { +; CHECK-LABEL: @test5( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <4 x i8*> [[ARG:%.*]] to <4 x i32> +; CHECK-NEXT: [[P1:%.*]] = zext <4 x i32> [[TMP1]] to <4 x i128> +; CHECK-NEXT: ret <4 x i128> [[P1]] +; + %p1 = ptrtoint <4 x i8*> %arg to <4 x i128> + ret <4 x i128> %p1 +} + +define <4 x i8*> @test6(<4 x i32> %arg) nounwind { +; CHECK-LABEL: @test6( +; CHECK-NEXT: [[P1:%.*]] = inttoptr <4 x i32> [[ARG:%.*]] to <4 x i8*> +; CHECK-NEXT: ret <4 x i8*> [[P1]] +; + %p1 = inttoptr <4 x i32> %arg to <4 x i8*> + ret <4 x i8*> %p1 +} + +define <4 x i8*> @test7(<4 x i128> %arg) nounwind { +; CHECK-LABEL: @test7( +; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i128> [[ARG:%.*]] to <4 x i32> +; CHECK-NEXT: [[P1:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x i8*> +; CHECK-NEXT: ret <4 x i8*> [[P1]] +; + %p1 = inttoptr <4 x i128> %arg to <4 x i8*> + ret <4 x i8*> %p1 +} Index: test/Transforms/LoopIdiom/basic_custom_dl.ll =================================================================== --- /dev/null +++ test/Transforms/LoopIdiom/basic_custom_dl.ll @@ -0,0 +1,1045 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-m:m-p:40:64:64:32-i32:32-i16:16-i8:8-n32" + + 
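+; The tests below appear to mirror basic.ll, re-run with a datalayout where
+; pointers are 40 bits wide but the index width is only 32 bits, so loop-idiom
+; emits @llvm.memset.p0i8.i32 with a truncated i32 length. A plausible C-level
+; source for @test1 (a hypothetical reconstruction from the IR, not part of
+; the original commit):
+;   void test1(char *Base, long Size) {
+;     for (long i = 0; i < Size; ++i)
+;       Base[i] = 0;
+;   }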
+define void @test1(i8* %Base, i64 %Size) nounwind ssp { +; CHECK-LABEL: @test1( +; CHECK-NEXT: bb.nph: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SIZE:%.*]] to i32 +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 1 [[BASE:%.*]], i8 0, i32 [[TMP0]], i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_014:%.*]] = getelementptr i8, i8* [[BASE]], i64 [[INDVAR]] +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +bb.nph: ; preds = %entry + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar + store i8 0, i8* %I.0.014, align 1 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; Make sure memset is formed for larger than 1 byte stores, and that the +; alignment of the store is preserved +define void @test1_i16(i16* align 2 %Base, i64 %Size) nounwind ssp { +; CHECK-LABEL: @test1_i16( +; CHECK-NEXT: bb.nph: +; CHECK-NEXT: [[BASE1:%.*]] = bitcast i16* [[BASE:%.*]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SIZE:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 1 +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 2 [[BASE1]], i8 0, i32 [[TMP1]], i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_014:%.*]] = getelementptr i16, i16* [[BASE]], i64 [[INDVAR]] +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +bb.nph: ; preds = %entry + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %I.0.014 = getelementptr i16, i16* %Base, i64 %indvar + store i16 0, i16* %I.0.014, align 2 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; This is a loop that was rotated but where the blocks weren't merged. This +; shouldn't perturb us. 
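+; The C-level source would be the same as for @test1; only the CFG shape
+; differs (hypothetical reconstruction):
+;   void test1a(char *Base, long Size) {
+;     for (long i = 0; i < Size; ++i)
+;       Base[i] = 0;
+;   }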
+define void @test1a(i8* %Base, i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test1a(
+; CHECK-NEXT: bb.nph:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SIZE:%.*]] to i32
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 1 [[BASE:%.*]], i8 0, i32 [[TMP0]], i1 false)
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY_CONT:%.*]] ]
+; CHECK-NEXT: [[I_0_014:%.*]] = getelementptr i8, i8* [[BASE]], i64 [[INDVAR]]
+; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT: br label [[FOR_BODY_CONT]]
+; CHECK: for.body.cont:
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+bb.nph: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body.cont ]
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  store i8 0, i8* %I.0.014, align 1
+  %indvar.next = add i64 %indvar, 1
+  br label %for.body.cont
+for.body.cont:
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  ret void
+}
+
+
+define void @test2(i32* %Base, i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[BASE1:%.*]] = bitcast i32* [[BASE:%.*]] to i8*
+; CHECK-NEXT: [[CMP10:%.*]] = icmp eq i64 [[SIZE:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SIZE]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 2
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 4 [[BASE1]], i8 1, i32 [[TMP1]], i1 false)
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[I_011]]
+; CHECK-NEXT: [[INC]] = add nsw i64 [[I_011]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[SIZE]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+  %cmp10 = icmp eq i64 %Size, 0
+  br i1 %cmp10, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %i.011 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %add.ptr.i = getelementptr i32, i32* %Base, i64 %i.011
+  store i32 16843009, i32* %add.ptr.i, align 4
+  %inc = add nsw i64 %i.011, 1
+  %exitcond = icmp eq i64 %inc, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  ret void
+}
+
+; This is a case where there is an extra may-aliased store in the loop, so we
+; can't promote the memset.
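+; Roughly, in C (a hypothetical reconstruction; MayAlias may point into the
+; Base array, so the byte store blocks memset formation):
+;   void test3(int *Base, long Size, char *MayAlias) {
+;     for (long i = 0; i < Size; ++i) {
+;       Base[i] = 0x01010101; /* 16843009 */
+;       *MayAlias = 42;
+;     }
+;   }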
+define void @test3(i32* %Base, i64 %Size, i8 *%MayAlias) nounwind ssp { +; CHECK-LABEL: @test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr i32, i32* [[BASE:%.*]], i64 [[I_011]] +; CHECK-NEXT: store i32 16843009, i32* [[ADD_PTR_I]], align 4 +; CHECK-NEXT: store i8 42, i8* [[MAYALIAS:%.*]] +; CHECK-NEXT: [[INC]] = add nsw i64 [[I_011]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[SIZE:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.011 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %add.ptr.i = getelementptr i32, i32* %Base, i64 %i.011 + store i32 16843009, i32* %add.ptr.i, align 4 + + store i8 42, i8* %MayAlias + %inc = add nsw i64 %i.011, 1 + %exitcond = icmp eq i64 %inc, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %entry + ret void +} + +; Make sure the first store in the loop is turned into a memset. +define void @test4(i8* %Base) nounwind ssp { +; CHECK-LABEL: @test4( +; CHECK-NEXT: bb.nph: +; CHECK-NEXT: [[BASE100:%.*]] = getelementptr i8, i8* [[BASE:%.*]], i64 1000 +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 1 [[BASE]], i8 0, i32 100, i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_014:%.*]] = getelementptr i8, i8* [[BASE]], i64 [[INDVAR]] +; CHECK-NEXT: store i8 42, i8* [[BASE100]] +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], 100 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +bb.nph: ; preds = %entry + %Base100 = getelementptr i8, i8* %Base, i64 1000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar + store i8 0, i8* %I.0.014, align 1 + + ;; Store beyond the range memset, should be safe to promote. + store i8 42, i8* %Base100 + + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; This can't be promoted: the memset is a store of a loop variant value. 
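+; Roughly, in C (hypothetical reconstruction):
+;   void test5(char *Base, long Size) {
+;     for (long i = 0; i < Size; ++i)
+;       Base[i] = (char)i; /* the stored value changes every iteration */
+;   }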
+define void @test5(i8* %Base, i64 %Size) nounwind ssp { +; CHECK-LABEL: @test5( +; CHECK-NEXT: bb.nph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_014:%.*]] = getelementptr i8, i8* [[BASE:%.*]], i64 [[INDVAR]] +; CHECK-NEXT: [[V:%.*]] = trunc i64 [[INDVAR]] to i8 +; CHECK-NEXT: store i8 [[V]], i8* [[I_0_014]], align 1 +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +bb.nph: ; preds = %entry + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar + + %V = trunc i64 %indvar to i8 + store i8 %V, i8* %I.0.014, align 1 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; memcpy formation +define void @test6(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test6( +; CHECK-NEXT: bb.nph: +; CHECK-NEXT: [[BASE:%.*]] = alloca i8, i32 10000 +; CHECK-NEXT: [[DEST:%.*]] = alloca i8, i32 10000 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SIZE:%.*]] to i32 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[BASE]], i32 [[TMP0]], i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_014:%.*]] = getelementptr i8, i8* [[BASE]], i64 [[INDVAR]] +; CHECK-NEXT: [[DESTI:%.*]] = getelementptr i8, i8* [[DEST]], i64 [[INDVAR]] +; CHECK-NEXT: [[V:%.*]] = load i8, i8* [[I_0_014]], align 1 +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +bb.nph: + %Base = alloca i8, i32 10000 + %Dest = alloca i8, i32 10000 + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar + %DestI = getelementptr i8, i8* %Dest, i64 %indvar + %V = load i8, i8* %I.0.014, align 1 + store i8 %V, i8* %DestI, align 1 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;; memcpy formation, check alignment +define void @test6_dest_align(i32* noalias align 1 %Base, i32* noalias align 4 %Dest, i64 %Size) nounwind ssp { +; CHECK-LABEL: @test6_dest_align( +; CHECK-NEXT: bb.nph: +; CHECK-NEXT: [[DEST1:%.*]] = bitcast i32* [[DEST:%.*]] to i8* +; CHECK-NEXT: [[BASE2:%.*]] = bitcast i32* [[BASE:%.*]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SIZE:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 2 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[DEST1]], i8* align 1 [[BASE2]], i32 [[TMP1]], i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_014:%.*]] = getelementptr 
i32, i32* [[BASE]], i64 [[INDVAR]] +; CHECK-NEXT: [[DESTI:%.*]] = getelementptr i32, i32* [[DEST]], i64 [[INDVAR]] +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[I_0_014]], align 1 +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +bb.nph: + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar + %DestI = getelementptr i32, i32* %Dest, i64 %indvar + %V = load i32, i32* %I.0.014, align 1 + store i32 %V, i32* %DestI, align 4 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;; memcpy formation, check alignment +define void @test6_src_align(i32* noalias align 4 %Base, i32* noalias align 1 %Dest, i64 %Size) nounwind ssp { +; CHECK-LABEL: @test6_src_align( +; CHECK-NEXT: bb.nph: +; CHECK-NEXT: [[DEST1:%.*]] = bitcast i32* [[DEST:%.*]] to i8* +; CHECK-NEXT: [[BASE2:%.*]] = bitcast i32* [[BASE:%.*]] to i8* +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SIZE:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 2 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[DEST1]], i8* align 4 [[BASE2]], i32 [[TMP1]], i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_014:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[INDVAR]] +; CHECK-NEXT: [[DESTI:%.*]] = getelementptr i32, i32* [[DEST]], i64 [[INDVAR]] +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[I_0_014]], align 4 +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +bb.nph: + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar + %DestI = getelementptr i32, i32* %Dest, i64 %indvar + %V = load i32, i32* %I.0.014, align 4 + store i32 %V, i32* %DestI, align 1 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + + +; This is a loop that was rotated but where the blocks weren't merged. This +; shouldn't perturb us. 
+define void @test7(i8* %Base, i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT: bb.nph:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[SIZE:%.*]] to i32
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 1 [[BASE:%.*]], i8 0, i32 [[TMP0]], i1 false)
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY_CONT:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY_CONT]]
+; CHECK: for.body.cont:
+; CHECK-NEXT: [[I_0_014:%.*]] = getelementptr i8, i8* [[BASE]], i64 [[INDVAR]]
+; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+bb.nph: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body.cont ]
+  br label %for.body.cont
+for.body.cont:
+  %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar
+  store i8 0, i8* %I.0.014, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  ret void
+}
+
+; This loop should not be transformed; it only executes one iteration.
+define void @test8(i64* %Ptr, i64 %Size) nounwind ssp {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT: bb.nph:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[PI:%.*]] = getelementptr i64, i64* [[PTR:%.*]], i64 [[INDVAR]]
+; CHECK-NEXT: store i64 0, i64* [[PI]]
+; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], 1
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+bb.nph: ; preds = %entry
+  br label %for.body
+
+for.body: ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %PI = getelementptr i64, i64* %Ptr, i64 %indvar
+  store i64 0, i64 *%PI
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  ret void
+}
+
+declare i8* @external(i8*)
+
+;; This cannot be transformed into a memcpy, because the read-from location is
+;; mutated by the loop.
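+;; Roughly, in C (hypothetical reconstruction; external() may return a
+;; pointer into Base, so the store through it can clobber the copy source):
+;;   extern char *external(char *);
+;;   void test9(long Size) {
+;;     char Base[10000], Dest[10000];
+;;     char *BaseAlias = external(Base);
+;;     for (long i = 0; i < Size; ++i) {
+;;       Dest[i] = Base[i];
+;;       *BaseAlias = 4;
+;;     }
+;;   }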
+define void @test9(i64 %Size) nounwind ssp { +; CHECK-LABEL: @test9( +; CHECK-NEXT: bb.nph: +; CHECK-NEXT: [[BASE:%.*]] = alloca i8, i32 10000 +; CHECK-NEXT: [[DEST:%.*]] = alloca i8, i32 10000 +; CHECK-NEXT: [[BASEALIAS:%.*]] = call i8* @external(i8* [[BASE]]) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_014:%.*]] = getelementptr i8, i8* [[BASE]], i64 [[INDVAR]] +; CHECK-NEXT: [[DESTI:%.*]] = getelementptr i8, i8* [[DEST]], i64 [[INDVAR]] +; CHECK-NEXT: [[V:%.*]] = load i8, i8* [[I_0_014]], align 1 +; CHECK-NEXT: store i8 [[V]], i8* [[DESTI]], align 1 +; CHECK-NEXT: store i8 4, i8* [[BASEALIAS]] +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +bb.nph: + %Base = alloca i8, i32 10000 + %Dest = alloca i8, i32 10000 + + %BaseAlias = call i8* @external(i8* %Base) + br label %for.body + +for.body: ; preds = %bb.nph, %for.body + %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ] + %I.0.014 = getelementptr i8, i8* %Base, i64 %indvar + %DestI = getelementptr i8, i8* %Dest, i64 %indvar + %V = load i8, i8* %I.0.014, align 1 + store i8 %V, i8* %DestI, align 1 + + ;; This store can clobber the input. + store i8 4, i8* %BaseAlias + + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, %Size + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; Two dimensional nested loop should be promoted to one big memset. +define void @test10(i8* %X) nounwind ssp { +; CHECK-LABEL: @test10( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 1 [[X:%.*]], i8 0, i32 10000, i1 false) +; CHECK-NEXT: br label [[BB_NPH:%.*]] +; CHECK: bb.nph: +; CHECK-NEXT: [[I_04:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC12:%.*]], [[FOR_INC10:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[I_04]], 100 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[X]], i32 [[TMP0]] +; CHECK-NEXT: br label [[FOR_BODY5:%.*]] +; CHECK: for.body5: +; CHECK-NEXT: [[J_02:%.*]] = phi i32 [ 0, [[BB_NPH]] ], [ [[INC:%.*]], [[FOR_BODY5]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_04]], 100 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[J_02]], [[MUL]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i64 [[IDXPROM]] +; CHECK-NEXT: [[INC]] = add nsw i32 [[J_02]], 1 +; CHECK-NEXT: [[CMP4:%.*]] = icmp eq i32 [[INC]], 100 +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_INC10]], label [[FOR_BODY5]] +; CHECK: for.inc10: +; CHECK-NEXT: [[INC12]] = add nsw i32 [[I_04]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC12]], 100 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END13:%.*]], label [[BB_NPH]] +; CHECK: for.end13: +; CHECK-NEXT: ret void +; +entry: + br label %bb.nph + +bb.nph: ; preds = %entry, %for.inc10 + %i.04 = phi i32 [ 0, %entry ], [ %inc12, %for.inc10 ] + br label %for.body5 + +for.body5: ; preds = %for.body5, %bb.nph + %j.02 = phi i32 [ 0, %bb.nph ], [ %inc, %for.body5 ] + %mul = mul nsw i32 %i.04, 100 + %add = add nsw i32 %j.02, %mul + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i8, i8* %X, i64 %idxprom + store i8 0, i8* %arrayidx, align 1 + %inc = add nsw i32 %j.02, 1 + %cmp4 = icmp eq i32 %inc, 
100 + br i1 %cmp4, label %for.inc10, label %for.body5 + +for.inc10: ; preds = %for.body5 + %inc12 = add nsw i32 %i.04, 1 + %cmp = icmp eq i32 %inc12, 100 + br i1 %cmp, label %for.end13, label %bb.nph + +for.end13: ; preds = %for.inc10 + ret void +} + +; On darwin10 (which is the triple in this .ll file) this loop can be turned +; into a memset_pattern call. +; rdar://9009151 +define void @test11_pattern(i32* nocapture %P) nounwind ssp { +; CHECK-LABEL: @test11_pattern( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 [[INDVAR]] +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], 10000 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ] + %arrayidx = getelementptr i32, i32* %P, i64 %indvar + store i32 1, i32* %arrayidx, align 4 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, 10000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Store of null should turn into memset of zero. +define void @test12(i32** nocapture %P) nounwind ssp { +; CHECK-LABEL: @test12( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32*, i32** [[P:%.*]], i64 [[INDVAR]] +; CHECK-NEXT: store i32* null, i32** [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], 10000 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ] + %arrayidx = getelementptr i32*, i32** %P, i64 %indvar + store i32* null, i32** %arrayidx, align 4 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, 10000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +@G = global i32 5 + +; This store-of-address loop can be turned into a memset_pattern call. 
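+; In C terms (hypothetical reconstruction; every element gets the same
+; pointer-sized pattern):
+;   extern int G;
+;   void test13_pattern(int **P) {
+;     for (long i = 0; i < 10000; ++i)
+;       P[i] = &G;
+;   }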
+; rdar://9009151 +define void @test13_pattern(i32** nocapture %P) nounwind ssp { +; CHECK-LABEL: @test13_pattern( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32*, i32** [[P:%.*]], i64 [[INDVAR]] +; CHECK-NEXT: store i32* @G, i32** [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], 10000 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ] + %arrayidx = getelementptr i32*, i32** %P, i64 %indvar + store i32* @G, i32** %arrayidx, align 4 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, 10000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + + + +; PR9815 - This is a partial overlap case that cannot be safely transformed +; into a memcpy. +@g_50 = global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16 + +define i32 @test14() nounwind { +; CHECK-LABEL: @test14( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], 4 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [7 x i32], [7 x i32]* @g_50, i32 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP5]], 5 +; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [7 x i32], [7 x i32]* @g_50, i32 0, i64 [[IDXPROM5]] +; CHECK-NEXT: store i32 [[TMP2]], i32* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[INC]] = add nsw i32 [[TMP5]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], 2 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([7 x i32], [7 x i32]* @g_50, i32 0, i64 6), align 4 +; CHECK-NEXT: ret i32 [[TMP8]] +; +entry: + br label %for.body + +for.body: ; preds = %for.inc, %for.body.lr.ph + %tmp5 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %add = add nsw i32 %tmp5, 4 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds [7 x i32], [7 x i32]* @g_50, i32 0, i64 %idxprom + %tmp2 = load i32, i32* %arrayidx, align 4 + %add4 = add nsw i32 %tmp5, 5 + %idxprom5 = sext i32 %add4 to i64 + %arrayidx6 = getelementptr inbounds [7 x i32], [7 x i32]* @g_50, i32 0, i64 %idxprom5 + store i32 %tmp2, i32* %arrayidx6, align 4 + %inc = add nsw i32 %tmp5, 1 + %cmp = icmp slt i32 %inc, 2 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.inc + %tmp8 = load i32, i32* getelementptr inbounds ([7 x i32], [7 x i32]* @g_50, i32 0, i64 6), align 4 + ret i32 %tmp8 + +} + +define void @PR14241(i32* %s, i64 %size) { +; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught +; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy +; instead of a memmove. If we get the memmove transform back, this will catch +; regressions. 
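+; Roughly, in C (hypothetical reconstruction of the overlapping copy):
+;   void PR14241(int *s, long size) {
+;     for (long i = 0; i != size - 1; ++i)
+;       s[i] = s[i + 1]; /* source and destination overlap */
+;   }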
+; +; CHECK-LABEL: @PR14241( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[END_IDX:%.*]] = add i64 [[SIZE:%.*]], -1 +; CHECK-NEXT: [[END_PTR:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 [[END_IDX]] +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[PHI_PTR:%.*]] = phi i32* [ [[S]], [[ENTRY:%.*]] ], [ [[NEXT_PTR:%.*]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[SRC_PTR:%.*]] = getelementptr inbounds i32, i32* [[PHI_PTR]], i64 1 +; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[SRC_PTR]], align 4 +; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds i32, i32* [[PHI_PTR]], i64 0 +; CHECK-NEXT: store i32 [[VAL]], i32* [[DST_PTR]], align 4 +; CHECK-NEXT: [[NEXT_PTR]] = getelementptr inbounds i32, i32* [[PHI_PTR]], i64 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[NEXT_PTR]], [[END_PTR]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[WHILE_BODY]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %end.idx = add i64 %size, -1 + %end.ptr = getelementptr inbounds i32, i32* %s, i64 %end.idx + br label %while.body +; FIXME: When we regain the ability to form a memmove here, this test should be +; reversed and turned into a positive assertion. + +while.body: + %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ] + %src.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1 + %val = load i32, i32* %src.ptr, align 4 + %dst.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 0 + store i32 %val, i32* %dst.ptr, align 4 + %next.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1 + %cmp = icmp eq i32* %next.ptr, %end.ptr + br i1 %cmp, label %exit, label %while.body + +exit: + ret void +} + +; Recognize loops with a negative stride. +define void @test15(i32* nocapture %f) { +; CHECK-LABEL: @test15( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[F1:%.*]] = bitcast i32* [[F:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 4 [[F1]], i8 0, i32 262148, i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 65536, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[INDVARS_IV]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 65536, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv + store i32 0, i32* %arrayidx, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +; Loop with a negative stride. Verify an aliasing write to f[65536] prevents +; the creation of a memset. 
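+; Roughly, in C (hypothetical reconstruction; the store to f[65536] may
+; alias the zeroing stores):
+;   void test16(int *f) {
+;     for (long i = 65536; i >= 0; --i) {
+;       f[i] = 0;
+;       f[65536] = 1;
+;     }
+;   }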
+define void @test16(i32* nocapture %f) { +; CHECK-LABEL: @test16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 65536 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 65536, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 0, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[INDVARS_IV]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + %arrayidx1 = getelementptr inbounds i32, i32* %f, i64 65536 + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 65536, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv + store i32 0, i32* %arrayidx, align 4 + store i32 1, i32* %arrayidx1, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body + ret void +} + +; Handle memcpy-able loops with negative stride. +define noalias i32* @test17(i32* nocapture readonly %a, i32 %c) { +; CHECK-LABEL: @test17( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[C:%.*]] to i64 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[CONV]], 2 +; CHECK-NEXT: [[CALL:%.*]] = tail call noalias i8* @malloc(i64 [[MUL]]) +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL]] to i32* +; CHECK-NEXT: [[TOBOOL_9:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_9]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] +; CHECK: while.body.preheader: +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[C]], 2 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[CALL]], i8* align 4 [[A1]], i32 [[TMP1]], i1 false) +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[DEC10_IN:%.*]] = phi i32 [ [[DEC10:%.*]], [[WHILE_BODY]] ], [ [[C]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[DEC10]] = add nsw i32 [[DEC10_IN]], -1 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[DEC10]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[DEC10]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: br label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: ret i32* [[TMP0]] +; +entry: + %conv = sext i32 %c to i64 + %mul = shl nsw i64 %conv, 2 + %call = tail call noalias i8* @malloc(i64 %mul) + %0 = bitcast i8* %call to i32* + %tobool.9 = icmp eq i32 %c, 0 + br i1 %tobool.9, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %dec10.in = phi i32 [ %dec10, %while.body ], [ %c, %while.body.preheader ] + %dec10 = add nsw i32 %dec10.in, -1 + %idxprom = sext i32 %dec10 to i64 + %arrayidx = getelementptr inbounds 
i32, i32* %a, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %0, i64 %idxprom + store i32 %1, i32* %arrayidx2, align 4 + %tobool = icmp eq i32 %dec10, 0 + br i1 %tobool, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + ret i32* %0 +} + +declare noalias i8* @malloc(i64) + +; Handle memcpy-able loops with negative stride. +; void test18(unsigned *__restrict__ a, unsigned *__restrict__ b) { +; for (int i = 2047; i >= 0; --i) { +; a[i] = b[i]; +; } +; } +define void @test18(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) #0 { +; CHECK-LABEL: @test18( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[B2:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[A1]], i8* align 4 [[B2]], i32 8192, i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 2047, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[INDVARS_IV]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 2047, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body + ret void +} + +; Two dimensional nested loop with negative stride should be promoted to one big memset. 
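+; Roughly, in C (hypothetical reconstruction; both loops count down, but
+; together they cover X[0..9999] exactly once):
+;   void test19(char *X) {
+;     for (int i = 99; i >= 0; --i)
+;       for (int j = 99; j >= 0; --j)
+;         X[i * 100 + j] = 0;
+;   }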
+define void @test19(i8* nocapture %X) { +; CHECK-LABEL: @test19( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 1 [[X:%.*]], i8 0, i32 10000, i1 false) +; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] +; CHECK: for.cond1.preheader: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_INC4:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ 99, [[ENTRY]] ], [ [[DEC5:%.*]], [[FOR_INC4]] ] +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[INDVAR]], -100 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 9900 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[X]], i32 [[TMP1]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_06]], 100 +; CHECK-NEXT: br label [[FOR_BODY3:%.*]] +; CHECK: for.body3: +; CHECK-NEXT: [[J_05:%.*]] = phi i32 [ 99, [[FOR_COND1_PREHEADER]] ], [ [[DEC:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[J_05]], [[MUL]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i64 [[IDXPROM]] +; CHECK-NEXT: [[DEC]] = add nsw i32 [[J_05]], -1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[J_05]], 0 +; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_INC4]] +; CHECK: for.inc4: +; CHECK-NEXT: [[DEC5]] = add nsw i32 [[I_06]], -1 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[I_06]], 0 +; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND1_PREHEADER]], label [[FOR_END6:%.*]] +; CHECK: for.end6: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc4 + %i.06 = phi i32 [ 99, %entry ], [ %dec5, %for.inc4 ] + %mul = mul nsw i32 %i.06, 100 + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.05 = phi i32 [ 99, %for.cond1.preheader ], [ %dec, %for.body3 ] + %add = add nsw i32 %j.05, %mul + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i8, i8* %X, i64 %idxprom + store i8 0, i8* %arrayidx, align 1 + %dec = add nsw i32 %j.05, -1 + %cmp2 = icmp sgt i32 %j.05, 0 + br i1 %cmp2, label %for.body3, label %for.inc4 + +for.inc4: ; preds = %for.body3 + %dec5 = add nsw i32 %i.06, -1 + %cmp = icmp sgt i32 %i.06, 0 + br i1 %cmp, label %for.cond1.preheader, label %for.end6 + +for.end6: ; preds = %for.inc4 + ret void +} + +; Handle loops where the trip count is a narrow integer that needs to be +; extended. 
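+; Roughly, in C (hypothetical reconstruction; the i32 trip count is widened
+; for the GEP and scaled by 8 into the i32 memset length):
+;   void form_memset_narrow_size(long long *ptr, int size) {
+;     for (int i = 0; i < size; ++i)
+;       ptr[i] = 0;
+;   }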
+define void @form_memset_narrow_size(i64* %ptr, i32 %size) { +; CHECK-LABEL: @form_memset_narrow_size( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[PTR1:%.*]] = bitcast i64* [[PTR:%.*]] to i8* +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP_PH:%.*]], label [[EXIT:%.*]] +; CHECK: loop.ph: +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[SIZE]], 3 +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[PTR1]], i8 0, i32 [[TMP0]], i1 false) +; CHECK-NEXT: br label [[LOOP_BODY:%.*]] +; CHECK: loop.body: +; CHECK-NEXT: [[STOREMERGE4:%.*]] = phi i32 [ 0, [[LOOP_PH]] ], [ [[INC:%.*]], [[LOOP_BODY]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STOREMERGE4]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 [[IDXPROM]] +; CHECK-NEXT: [[INC]] = add nsw i32 [[STOREMERGE4]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[INC]], [[SIZE]] +; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP_BODY]], label [[LOOP_EXIT:%.*]] +; CHECK: loop.exit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %cmp1 = icmp sgt i32 %size, 0 + br i1 %cmp1, label %loop.ph, label %exit + +loop.ph: + br label %loop.body + +loop.body: + %storemerge4 = phi i32 [ 0, %loop.ph ], [ %inc, %loop.body ] + %idxprom = sext i32 %storemerge4 to i64 + %arrayidx = getelementptr inbounds i64, i64* %ptr, i64 %idxprom + store i64 0, i64* %arrayidx, align 8 + %inc = add nsw i32 %storemerge4, 1 + %cmp2 = icmp slt i32 %inc, %size + br i1 %cmp2, label %loop.body, label %loop.exit + +loop.exit: + br label %exit + +exit: + ret void +} + +define void @form_memcpy_narrow_size(i64* noalias %dst, i64* noalias %src, i32 %size) { +; CHECK-LABEL: @form_memcpy_narrow_size( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DST1:%.*]] = bitcast i64* [[DST:%.*]] to i8* +; CHECK-NEXT: [[SRC2:%.*]] = bitcast i64* [[SRC:%.*]] to i8* +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP_PH:%.*]], label [[EXIT:%.*]] +; CHECK: loop.ph: +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[SIZE]], 3 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[DST1]], i8* align 8 [[SRC2]], i32 [[TMP0]], i1 false) +; CHECK-NEXT: br label [[LOOP_BODY:%.*]] +; CHECK: loop.body: +; CHECK-NEXT: [[STOREMERGE4:%.*]] = phi i32 [ 0, [[LOOP_PH]] ], [ [[INC:%.*]], [[LOOP_BODY]] ] +; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[STOREMERGE4]] to i64 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[SRC]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[V:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[IDXPROM2:%.*]] = sext i32 [[STOREMERGE4]] to i64 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[DST]], i64 [[IDXPROM2]] +; CHECK-NEXT: [[INC]] = add nsw i32 [[STOREMERGE4]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[INC]], [[SIZE]] +; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP_BODY]], label [[LOOP_EXIT:%.*]] +; CHECK: loop.exit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %cmp1 = icmp sgt i32 %size, 0 + br i1 %cmp1, label %loop.ph, label %exit + +loop.ph: + br label %loop.body + +loop.body: + %storemerge4 = phi i32 [ 0, %loop.ph ], [ %inc, %loop.body ] + %idxprom1 = sext i32 %storemerge4 to i64 + %arrayidx1 = getelementptr inbounds i64, i64* %src, i64 %idxprom1 + %v = load i64, i64* %arrayidx1, align 8 + %idxprom2 = sext i32 %storemerge4 to i64 + %arrayidx2 = getelementptr inbounds i64, i64* %dst, i64 %idxprom2 + store i64 %v, i64* %arrayidx2, align 8 + %inc = 
add nsw i32 %storemerge4, 1
+  %cmp2 = icmp slt i32 %inc, %size
+  br i1 %cmp2, label %loop.body, label %loop.exit
+
+loop.exit:
+  br label %exit
+
+exit:
+  ret void
+}
Index: test/Transforms/LoopVectorize/runtime-check_custom_dl.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/runtime-check_custom_dl.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:m-p:40:64:64:32-i32:32-i16:16-i8:8-n32"
+
+; Make sure we vectorize this loop:
+; int foo(float *a, float *b, int n) {
+;   for (int i=0; i<n; i++) {
+;     a[i] = b[i] * 3;
+;   }
+; }
+define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp !dbg !5 {
+; CHECK-LABEL: @foo(
+; CHECK: vector.body:
+; CHECK: [[TMP6:%.*]] = bitcast float* [[TMP5:%.*]] to <4 x float>*, !dbg !9
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !dbg !9, !alias.scope !10
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[WIDE_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, !dbg !9
+; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDEX]] to i32, !dbg !9
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[TMP8]], !dbg !9
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*, !dbg !9
+; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP10]], align 4, !dbg !9, !alias.scope !13, !noalias !10
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4, !dbg !9
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg !9
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg !9, !llvm.loop !15
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]], !dbg !9
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg !9
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], !dbg !9
+; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[INDVARS_IV]] to i32, !dbg !9
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[TMP12]], !dbg !9
+; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4, !dbg !9
+; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP13]], 3.000000e+00, !dbg !9
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV]] to i32, !dbg !9
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[TMP14]], !dbg !9
+; CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX2]], align 4, !dbg !9
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1, !dbg !9
+; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg !9
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg !9
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !dbg !9, !llvm.loop !17
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]], !dbg !18
+; CHECK: for.end:
+; CHECK-NEXT: ret i32 undef, !dbg !18
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0, !dbg !6
+  br i1 %cmp6, label %for.body, label %for.end, !dbg !6
+
+for.body: ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ], !dbg !7
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv, !dbg !7
+  %0 = load float, float* %arrayidx, align 4, !dbg !7
+  %mul = fmul float %0, 3.000000e+00, !dbg !7
+  %arrayidx2 = 
getelementptr inbounds float, float* %a, i64 %indvars.iv, !dbg !7 + store float %mul, float* %arrayidx2, align 4, !dbg !7 + %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !7 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !7 + %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !7 + br i1 %exitcond, label %for.end, label %for.body, !dbg !7 + +for.end: ; preds = %for.body, %entry + ret i32 undef, !dbg !8 +} + +; Make sure that we try to vectorize loops with a runtime check if the +; dependency check fails. + +; CHECK-LABEL: test_runtime_check +; CHECK: <4 x float> +define void @test_runtime_check(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) { +; CHECK-LABEL: @test_runtime_check( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET:%.*]] to i32 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[N]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr float, float* [[A]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[OFFSET2:%.*]] to i32 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr float, float* [[A]], i32 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult float* [[SCEVGEP]], [[SCEVGEP6]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult float* [[SCEVGEP4]], [[SCEVGEP2]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT9]], <4 x float> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], [[OFFSET]] +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP8]], align 4, !alias.scope !19, !noalias !22 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[OFFSET2]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP12]], align 4, !alias.scope !22 +; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT10]], [[WIDE_LOAD8]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP7]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP15]], align 4, !alias.scope !19, !noalias !22 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !24 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IND_SUM:%.*]] = add i64 [[IV]], [[OFFSET]] +; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[IND_SUM]] to i32 +; CHECK-NEXT: [[ARR_IDX:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[TMP17]] +; CHECK-NEXT: [[L1:%.*]] = load float, float* [[ARR_IDX]], align 4 +; CHECK-NEXT: [[IND_SUM2:%.*]] = add i64 [[IV]], [[OFFSET2]] +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[IND_SUM2]] to i32 +; CHECK-NEXT: [[ARR_IDX2:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[TMP18]] +; CHECK-NEXT: [[L2:%.*]] = load float, float* [[ARR_IDX2]], align 4 +; CHECK-NEXT: [[M:%.*]] = fmul fast float [[L2]], [[B]] +; CHECK-NEXT: [[AD:%.*]] = fadd fast float [[L1]], [[M]] +; CHECK-NEXT: store float [[AD]], float* [[ARR_IDX]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !25 +; CHECK: loopexit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %ind.sum = add i64 %iv, %offset + %arr.idx = getelementptr inbounds float, float* %a, i64 %ind.sum + %l1 = load float, float* %arr.idx, align 4 + %ind.sum2 = add i64 %iv, %offset2 + %arr.idx2 = getelementptr inbounds float, float* %a, i64 %ind.sum2 + %l2 = load float, float* %arr.idx2, align 4 + %m = fmul fast float %b, %l2 + %ad = fadd fast float %l1, %m + store float %ad, float* %arr.idx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %loopexit, label %for.body + +loopexit: + ret void +} + +; CHECK: !9 = !DILocation(line: 101, column: 1, scope: !{{.*}}) + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!9} +!0 = !{i32 2, !"Dwarf Version", i32 4} +!1 = !{i32 2, !"Debug Info Version", i32 3} + +!2 = !{} +!3 = !DISubroutineType(types: !2) +!4 = !DIFile(filename: "test.cpp", directory: "/tmp") +!5 = distinct !DISubprogram(name: "foo", scope: !4, file: !4, line: 99, type: !3, isLocal: false, isDefinition: true, scopeLine: 100, flags: DIFlagPrototyped, isOptimized: false, unit: !9, retainedNodes: !2) +!6 = !DILocation(line: 100, column: 1, scope: !5) +!7 = !DILocation(line: 101, column: 1, scope: !5) +!8 = !DILocation(line: 102, column: 1, scope: !5) +!9 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", + file: !10, + isOptimized: true, flags: "-O2", + splitDebugFilename: "abc.debug", emissionKind: 2) +!10 = !DIFile(filename: "path/to/file", directory: "/path/to/dir") +!11 = !{i32 2, !"Debug Info Version", i32 3} Index: test/Transforms/SeparateConstOffsetFromGEP/custom_dl.ll =================================================================== --- /dev/null +++ test/Transforms/SeparateConstOffsetFromGEP/custom_dl.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S 
-separate-const-offset-from-gep -reassociate-geps-verify-no-dead-code -gvn < %s | FileCheck %s + +target datalayout = "e-m:m-p:40:64:64:32-i32:32-i16:16-i8:8-n32" + +@array = internal addrspace(4) constant [4096 x [32 x float]] zeroinitializer, align 4 + +define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) { +; CHECK-LABEL: @sum_of_array( +; CHECK-NEXT: [[TMP:%.*]] = sext i32 [[Y:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i32 0, i32 [[X]], i32 [[Y]] +; CHECK-NEXT: [[TMP4:%.*]] = load float, float addrspace(4)* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[Y]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i32 0, i32 [[X]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, float addrspace(4)* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = fadd float [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[X]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i32 0, i32 [[TMP12]], i32 [[Y]] +; CHECK-NEXT: [[TMP16:%.*]] = load float, float addrspace(4)* [[TMP14]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = fadd float [[TMP11]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i32 0, i32 [[TMP12]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP20:%.*]] = load float, float addrspace(4)* [[TMP18]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = fadd float [[TMP17]], [[TMP20]] +; CHECK-NEXT: store float [[TMP21]], float addrspace(1)* [[OUTPUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %tmp = sext i32 %y to i64 + %tmp1 = sext i32 %x to i64 + %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp1, i64 %tmp + %tmp4 = load float, float addrspace(4)* %tmp2, align 4 + %tmp5 = fadd float %tmp4, 0.000000e+00 + %tmp6 = add i32 %y, 1 + %tmp7 = sext i32 %tmp6 to i64 + %tmp8 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp1, i64 %tmp7 + %tmp10 = load float, float addrspace(4)* %tmp8, align 4 + %tmp11 = fadd float %tmp5, %tmp10 + %tmp12 = add i32 %x, 1 + %tmp13 = sext i32 %tmp12 to i64 + %tmp14 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp13, i64 %tmp + %tmp16 = load float, float addrspace(4)* %tmp14, align 4 + %tmp17 = fadd float %tmp11, %tmp16 + %tmp18 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp13, i64 %tmp7 + %tmp20 = load float, float addrspace(4)* %tmp18, align 4 + %tmp21 = fadd float %tmp17, %tmp20 + store float %tmp21, float addrspace(1)* %output, align 4 + ret void +}
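+
+; For reference, a hypothetical C-level summary of @sum_of_array above:
+;   *output = array[x][y] + array[x][y+1]
+;           + array[x+1][y] + array[x+1][y+1];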