Index: lib/Analysis/ValueTracking.cpp
===================================================================
--- lib/Analysis/ValueTracking.cpp
+++ lib/Analysis/ValueTracking.cpp
@@ -2016,11 +2016,33 @@
     if (isKnownNonNullFromDominatingCondition(V, Q.CxtI, Q.DT))
       return true;
 
+    // Look through bitcast operations, GEPs, and int2ptr instructions as they
+    // do not alter the value, or at least not the nullness property of the
+    // value, e.g., int2ptr is allowed to zero/sign extend the value.
+    //
+    // Note that we have to take special care to avoid looking through
+    // truncating casts, e.g., int2ptr/ptr2int with appropriate sizes, as well
+    // as casts that can alter the value, e.g., AddrSpaceCasts.
     if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V))
       if (isGEPKnownNonNull(GEP, Depth, Q))
        return true;
+
+    if (auto *BCO = dyn_cast<BitCastOperator>(V))
+      return isKnownNonZero(BCO->getOperand(0), Depth, Q);
+
+    if (auto *I2P = dyn_cast<IntToPtrInst>(V))
+      if (Q.DL.getTypeSizeInBits(I2P->getSrcTy()) <=
+          Q.DL.getTypeSizeInBits(I2P->getDestTy()))
+        return isKnownNonZero(I2P->getOperand(0), Depth, Q);
   }
 
+  // Similar to int2ptr above, we can look through ptr2int here if the cast
+  // is a no-op or an extend and not a truncate.
+  if (auto *P2I = dyn_cast<PtrToIntInst>(V))
+    if (Q.DL.getTypeSizeInBits(P2I->getSrcTy()) <=
+        Q.DL.getTypeSizeInBits(P2I->getDestTy()))
+      return isKnownNonZero(P2I->getOperand(0), Depth, Q);
+
   unsigned BitWidth = getBitWidth(V->getType()->getScalarType(), Q.DL);
 
   // X | Y != 0 if X != 0 or Y != 0.
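Note on the two size checks above: both implement the rule that a cast may only be looked through when it cannot truncate. A minimal standalone sketch of that rule (plain C++, not the LLVM API; the bit widths stand in for the `getTypeSizeInBits` results used in the hunk):

  #include <cassert>

  // A non-zero value stays non-zero across a no-op or widening cast
  // (zero/sign extension cannot produce 0 from a non-zero input), but a
  // truncating cast can drop every set bit.
  bool castPreservesNonZero(unsigned SrcBits, unsigned DstBits) {
    return SrcBits <= DstBits;
  }

  int main() {
    assert(castPreservesNonZero(32, 64));   // extending int2ptr: safe
    assert(castPreservesNonZero(64, 64));   // no-op ptr2int: safe
    // 128 -> 64 is truncating: e.g. (i128)1 << 64 is non-zero, yet its low
    // 64 bits are all zero, so nonnull must not be propagated through it.
    assert(!castPreservesNonZero(128, 64));
  }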
Index: test/Transforms/InstCombine/alloca-cast-debuginfo.ll
===================================================================
--- test/Transforms/InstCombine/alloca-cast-debuginfo.ll
+++ test/Transforms/InstCombine/alloca-cast-debuginfo.ll
@@ -45,7 +45,7 @@
 ; Another dbg.value for "local" would be redundant here.
 ; CHECK-NOT: call void @llvm.dbg.value(metadata i8* [[simplified]], metadata !22, metadata !DIExpression())
 ;
-; CHECK: call void @escape(i8* [[simplified]])
+; CHECK: call void @escape(i8* nonnull [[simplified]])
 ; CHECK: ret void
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata)
Index: test/Transforms/InstCombine/callsite_nonnull_args_through_casts.ll
===================================================================
--- /dev/null
+++ test/Transforms/InstCombine/callsite_nonnull_args_through_casts.ll
@@ -0,0 +1,99 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @foo(i8*)
+declare void @bar(i8 addrspace(1)*)
+
+define void @nonnullAfterBitCast() {
+entry:
+  %i = alloca i32, align 4
+  %tmp1 = bitcast i32* %i to i8*
+; CHECK: call void @foo(i8* nonnull %tmp1)
+  call void @foo(i8* %tmp1)
+  ret void
+}
+
+define void @nonnullAfterSExt(i8 %a) {
+entry:
+  %b = zext i8 %a to i32 ; <- %b is >= 0
+  %c = add nsw nuw i32 %b, 2 ; <- %c is > 0
+  %sext = sext i32 %c to i64 ; <- %sext cannot be 0 because %c is not 0
+  %i2p = inttoptr i64 %sext to i8* ; <- no-op int2ptr cast
+; CHECK: call void @foo(i8* nonnull %i2p)
+  call void @foo(i8* %i2p)
+  ret void
+}
+
+define void @nonnullAfterZExt(i8 %a) {
+entry:
+  %b = zext i8 %a to i32 ; <- %b is >= 0
+  %c = add nsw nuw i32 %b, 2 ; <- %c is > 0
+  %zext = zext i32 %c to i64 ; <- %zext cannot be 0 because %c is not 0
+  %i2p = inttoptr i64 %zext to i8* ; <- no-op int2ptr cast
+; CHECK: call void @foo(i8* nonnull %i2p)
+  call void @foo(i8* %i2p)
+  ret void
+}
+
+declare void @llvm.assume(i1 %b)
+
+define void @nonnullAfterInt2Ptr(i32 %u, i64 %lu) {
+entry:
+  %nz = sdiv exact i32 100, %u ; %nz cannot be null
+  %i2p = inttoptr i32 %nz to i8* ; extending int2ptr as sizeof(i32) < sizeof(i8*)
+; CHECK: call void @foo(i8* nonnull %i2p)
+  call void @foo(i8* %i2p)
+
+  %nz.2 = sdiv exact i64 100, %lu ; %nz.2 cannot be null
+  %i2p.2 = inttoptr i64 %nz.2 to i8* ; no-op int2ptr as sizeof(i64) == sizeof(i8*)
+; CHECK: call void @foo(i8* nonnull %i2p.2)
+  call void @foo(i8* %i2p.2)
+  ret void
+}
+
+define void @nonnullAfterPtr2Int() {
+entry:
+  %a = alloca i32
+  %p2i = ptrtoint i32* %a to i64 ; no-op ptr2int as sizeof(i32*) == sizeof(i64)
+  %i2p = inttoptr i64 %p2i to i8*
+; CHECK: call void @foo(i8* nonnull %i2p)
+  call void @foo(i8* %i2p)
+  ret void
+}
+
+define void @maybenullAfterInt2Ptr(i128 %llu) {
+entry:
+  %cmp = icmp ne i128 %llu, 0
+  call void @llvm.assume(i1 %cmp) ; %llu != 0
+  %i2p = inttoptr i128 %llu to i8* ; truncating int2ptr as sizeof(i128) > sizeof(i8*)
+; CHECK: call void @foo(i8* %i2p)
+  call void @foo(i8* %i2p)
+  ret void
+}
+
+define void @maybenullAfterPtr2Int() {
+entry:
+  %a = alloca i32
+  %p2i = ptrtoint i32* %a to i32 ; truncating ptr2int as sizeof(i32*) > sizeof(i32)
+  %i2p = inttoptr i32 %p2i to i8*
+; CHECK: call void @foo(i8* %i2p)
+  call void @foo(i8* %i2p)
+  ret void
+}
+
+define void @maybenullAfterAddrspacecast(i8* nonnull %p) {
+entry:
+  %addrspcast = addrspacecast i8* %p to i8 addrspace(1)*
+
+; An address space cast can be "a no-op cast or a complex value modification,
+; depending on the target and the address space pair". As a consequence, we
+; cannot simply assume non-nullness of %p is preserved by the cast.
+;
+; CHECK: call void @bar(i8 addrspace(1)* %addrspcast)
+  call void @bar(i8 addrspace(1)* %addrspcast)
+
+; CHECK: call void @foo(i8* nonnull %p)
+  call void @foo(i8* %p)
+  ret void
+}
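For reviewers tracing where the `nonnull` in the CHECK lines above (and in the updated tests below) comes from: it is attached by InstCombine's call-site visitor, which queries this analysis for every pointer argument. The following is a rough paraphrase of that loop in `visitCallSite`, shown only for orientation; the surrounding names (`CS`, `AC`, `DT`, `DL`, `ArgNos`) come from that context and are not part of this patch:

  // For each pointer argument that is not already annotated, ask
  // isKnownNonZero() -- the function extended by this patch -- and record
  // the argument index so the nonnull attribute can be added afterwards.
  SmallVector<unsigned, 4> ArgNos;
  unsigned ArgNo = 0;
  for (Value *V : CS.args()) {
    if (V->getType()->isPointerTy() &&
        !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
        isKnownNonZero(V, DL, /*Depth=*/0, &AC, CS.getInstruction(), &DT))
      ArgNos.push_back(ArgNo);
    ++ArgNo;
  }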
Index: test/Transforms/InstCombine/objsize.ll
===================================================================
--- test/Transforms/InstCombine/objsize.ll
+++ test/Transforms/InstCombine/objsize.ll
@@ -114,7 +114,7 @@
   %1 = bitcast %struct.data* %0 to i8*
   %2 = call i32 @llvm.objectsize.i32.p0i8(i8* %1, i1 false, i1 false) nounwind
 ; CHECK-NOT: @llvm.objectsize
-; CHECK: @llvm.memset.p0i8.i32(i8* align 8 %1, i8 0, i32 1824, i1 false)
+; CHECK: @llvm.memset.p0i8.i32(i8* nonnull align 8 %1, i8 0, i32 1824, i1 false)
   %3 = call i8* @__memset_chk(i8* %1, i32 0, i32 1824, i32 %2) nounwind
   store i8* %1, i8** %esc
   ret i32 0
Index: test/Transforms/InstCombine/select.ll
===================================================================
--- test/Transforms/InstCombine/select.ll
+++ test/Transforms/InstCombine/select.ll
@@ -1141,7 +1141,7 @@
 ; CHECK-NEXT: [[Y:%.*]] = alloca i32, align 4
 ; CHECK-NEXT: [[X1:%.*]] = bitcast float* [[X]] to i32*
 ; CHECK-NEXT: [[Y1:%.*]] = bitcast i32* [[Y]] to float*
-; CHECK-NEXT: call void @scribble_on_i32(i32* [[X1]])
+; CHECK-NEXT: call void @scribble_on_i32(i32* nonnull [[X1]])
 ; CHECK-NEXT: call void @scribble_on_i32(i32* nonnull [[Y]])
 ; CHECK-NEXT: [[TMP:%.*]] = load float, float* [[X]], align 4
 ; CHECK-NEXT: store float [[TMP]], float* [[Y1]], align 4
@@ -1172,8 +1172,8 @@
 ; CHECK-NEXT: [[Y:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT: [[TMPCAST:%.*]] = bitcast i8** [[Y]] to i64*
 ; CHECK-NEXT: [[X1:%.*]] = bitcast i8** [[X]] to i64*
-; CHECK-NEXT: call void @scribble_on_i64(i64* [[X1]])
-; CHECK-NEXT: call void @scribble_on_i64(i64* [[TMPCAST]])
+; CHECK-NEXT: call void @scribble_on_i64(i64* nonnull [[X1]])
+; CHECK-NEXT: call void @scribble_on_i64(i64* nonnull [[TMPCAST]])
 ; CHECK-NEXT: [[TMP:%.*]] = load i64, i64* [[X1]], align 8
 ; CHECK-NEXT: store i64 [[TMP]], i64* [[TMPCAST]], align 8
 ; CHECK-NEXT: [[V:%.*]] = inttoptr i64 [[TMP]] to i8*
@@ -1200,8 +1200,8 @@
 ; CHECK-NEXT: [[Y:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT: [[TMPCAST:%.*]] = bitcast i8** [[Y]] to i64*
 ; CHECK-NEXT: [[X1:%.*]] = bitcast i8** [[X]] to i64*
-; CHECK-NEXT: call void @scribble_on_i64(i64* [[X1]])
-; CHECK-NEXT: call void @scribble_on_i64(i64* [[TMPCAST]])
+; CHECK-NEXT: call void @scribble_on_i64(i64* nonnull [[X1]])
+; CHECK-NEXT: call void @scribble_on_i64(i64* nonnull [[TMPCAST]])
 ; CHECK-NEXT: [[TMP:%.*]] = load i8*, i8** [[X]], align 8
 ; CHECK-NEXT: store i8* [[TMP]], i8** [[Y]], align 8
 ; CHECK-NEXT: [[V:%.*]] = ptrtoint i8* [[TMP]] to i64
@@ -1230,7 +1230,7 @@
 ; CHECK-NEXT: [[X1_SUB:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[X1]], i64 0, i64 0
 ; CHECK-NEXT: [[X2:%.*]] = bitcast [2 x i8*]* [[X1]] to i128*
 ; CHECK-NEXT: [[Y1:%.*]] = bitcast i128* [[Y]] to i8**
-; CHECK-NEXT: call void @scribble_on_i128(i128* [[X2]])
+; CHECK-NEXT: call void @scribble_on_i128(i128* nonnull [[X2]])
 ; CHECK-NEXT: call void @scribble_on_i128(i128* nonnull [[Y]])
 ; CHECK-NEXT: [[TMP:%.*]] = load i128, i128* [[X2]], align 8
 ; CHECK-NEXT: store i128 [[TMP]], i128* [[Y]], align 8
@@ -1263,7 +1263,7 @@
 ; CHECK-NEXT: [[X1_SUB:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[X1]], i64 0, i64 0
 ; CHECK-NEXT: [[X2:%.*]] = bitcast [2 x i8*]* [[X1]] to i128*
 ; CHECK-NEXT: [[Y1:%.*]] = bitcast i128* [[Y]] to i8**
-; CHECK-NEXT: call void @scribble_on_i128(i128* [[X2]])
+; CHECK-NEXT: call void @scribble_on_i128(i128* nonnull [[X2]])
 ; CHECK-NEXT: call void @scribble_on_i128(i128* nonnull [[Y]])
 ; CHECK-NEXT: [[TMP:%.*]] = load i8*, i8** [[X1_SUB]], align 8
 ; CHECK-NEXT: store i8* [[TMP]], i8** [[Y1]], align 8
Index: test/Transforms/LoopVectorize/X86/gather_scatter.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -47,7 +47,7 @@
 ; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_1]], zeroinitializer
 ; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
 ; AVX512-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP14]], i32 4, <16 x i1> [[TMP12]], <16 x i32> undef)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP14]], i32 4, <16 x i1> [[TMP12]], <16 x i32> undef)
 ; AVX512-NEXT: [[TMP15:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_1]] to <16 x i64>
 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP15]]
 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP16]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef)
@@ -62,7 +62,7 @@
 ; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_2]], zeroinitializer
 ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
 ; AVX512-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP24]], i32 4, <16 x i1> [[TMP22]], <16 x i32> undef)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP24]], i32 4, <16 x i1> [[TMP22]], <16 x i32> undef)
 ; AVX512-NEXT: [[TMP25:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_2]] to <16 x i64>
 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP25]]
 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP26]], i32 4, <16 x i1> [[TMP22]], <16 x float> undef)
@@ -77,7 +77,7 @@
 ; AVX512-NEXT: [[TMP32:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_3]], zeroinitializer
 ; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
 ; AVX512-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP34]], i32 4, <16 x i1> [[TMP32]], <16 x i32> undef)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP34]], i32 4, <16 x i1> [[TMP32]], <16 x i32> undef)
 ; AVX512-NEXT: [[TMP35:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_3]] to <16 x i64>
 ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP35]]
 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP36]], i32 4, <16 x i1> [[TMP32]], <16 x float> undef)
@@ -117,7 +117,7 @@
 ; FVW2-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_1]], zeroinitializer
 ; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
 ; FVW2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
-; FVW2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP14]], i32 4, <2 x i1> [[TMP12]], <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP14]], i32 4, <2 x i1> [[TMP12]], <2 x i32> undef)
 ; FVW2-NEXT: [[TMP15:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_1]] to <2 x i64>
 ; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP15]]
 ; FVW2-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP16]], i32 4, <2 x i1> [[TMP12]], <2 x float> undef)
@@ -132,7 +132,7 @@
 ; FVW2-NEXT: [[TMP22:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_2]], zeroinitializer
 ; FVW2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
 ; FVW2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <2 x i32>*
-; FVW2-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP24]], i32 4, <2 x i1> [[TMP22]], <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP24]], i32 4, <2 x i1> [[TMP22]], <2 x i32> undef)
 ; FVW2-NEXT: [[TMP25:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_2]] to <2 x i64>
 ; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP25]]
 ; FVW2-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP22]], <2 x float> undef)
@@ -147,7 +147,7 @@
 ; FVW2-NEXT: [[TMP32:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_3]], zeroinitializer
 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
 ; FVW2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <2 x i32>*
-; FVW2-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP34]], i32 4, <2 x i1> [[TMP32]], <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP34]], i32 4, <2 x i1> [[TMP32]], <2 x i32> undef)
 ; FVW2-NEXT: [[TMP35:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_3]] to <2 x i64>
 ; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP35]]
 ; FVW2-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP36]], i32 4, <2 x i1> [[TMP32]], <2 x float> undef)
Index: test/Transforms/LoopVectorize/X86/masked_load_store.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -52,7 +52,7 @@
 ; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]],
 ; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
 ; AVX1-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3
+; AVX1-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3
 ; AVX1-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
 ; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]]
 ; AVX1-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
@@ -129,13 +129,13 @@
 ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !3
 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 8
 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !3
 ; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 16
 ; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3
 ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 24
 ; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !3
 ; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
 ; AVX2-NEXT: [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
 ; AVX2-NEXT: [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
@@ -171,16 +171,16 @@
 ; AVX2-NEXT: [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]],
 ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !3
 ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 8
 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !3
 ; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16
 ; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !3
 ; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 24
 ; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <8 x i32>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !3
 ; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
 ; AVX2-NEXT: [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
 ; AVX2-NEXT: [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
@@ -298,13 +298,13 @@
 ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !3
 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 16
 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !3
 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 32
 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !3
 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 48
 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !3
 ; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
 ; AVX512-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
 ; AVX512-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
@@ -340,16 +340,16 @@
 ; AVX512-NEXT: [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]],
 ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !3
 ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16
 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !3
 ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 32
 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !3
 ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 48
 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !3
 ; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
 ; AVX512-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
 ; AVX512-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
@@ -1012,13 +1012,13 @@
 ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24
 ; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8
 ; AVX1-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24
+; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24
 ; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
 ; AVX1-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24
+; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24
 ; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24
 ; AVX1-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24
+; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24
 ; AVX1-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
 ; AVX1-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float>
 ; AVX1-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float>
@@ -1118,13 +1118,13 @@
 ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24
 ; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8
 ; AVX2-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24
+; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24
 ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
 ; AVX2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24
+; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24
 ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24
 ; AVX2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24
+; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24
 ; AVX2-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
 ; AVX2-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float>
 ; AVX2-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float>
@@ -1252,13 +1252,13 @@
 ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> undef), !alias.scope !24
 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP17]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP17]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef), !alias.scope !24
 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 32
 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP19]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef), !alias.scope !24
 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 48
 ; AVX512-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP21]], i32 4, <16 x i1> [[TMP13]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP21]], i32 4, <16 x i1> [[TMP13]], <16 x float> undef), !alias.scope !24
 ; AVX512-NEXT: [[TMP22:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
 ; AVX512-NEXT: [[TMP23:%.*]] = sitofp <16 x i32> [[WIDE_LOAD22]] to <16 x float>
 ; AVX512-NEXT: [[TMP24:%.*]] = sitofp <16 x i32> [[WIDE_LOAD23]] to <16 x float>
@@ -1457,13 +1457,13 @@
 ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34
 ; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4
 ; AVX1-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
 ; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
 ; AVX1-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
 ; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12
 ; AVX1-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
 ; AVX1-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
 ; AVX1-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double>
 ; AVX1-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double>
@@ -1560,13 +1560,13 @@
 ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34
 ; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4
 ; AVX2-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
 ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
 ; AVX2-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
 ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12
 ; AVX2-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
 ; AVX2-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
 ; AVX2-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double>
 ; AVX2-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double>
@@ -1691,13 +1691,13 @@
 ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP15]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !34
 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <8 x double>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP17]], i32 8, <8 x i1> [[TMP11]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP11]], <8 x double> undef), !alias.scope !34
 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 16
 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <8 x double>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP19]], i32 8, <8 x i1> [[TMP12]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP12]], <8 x double> undef), !alias.scope !34
 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 24
 ; AVX512-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <8 x double>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> undef), !alias.scope !34
 ; AVX512-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
 ; AVX512-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x double>
 ; AVX512-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x double>
@@ -2453,22 +2453,22 @@
 ; AVX1-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3
 ; AVX1-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32>
 ; AVX1-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44
 ; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4
 ; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3
 ; AVX1-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32>
 ; AVX1-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44
 ; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
 ; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -3
 ; AVX1-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32>
 ; AVX1-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44
 ; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12
 ; AVX1-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3
 ; AVX1-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32>
 ; AVX1-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
-; AVX1-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44
 ; AVX1-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]],
 ; AVX1-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]],
 ; AVX1-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]],
@@ -2571,22 +2571,22 @@
 ; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3
 ; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32>
 ; AVX2-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44
 ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4
 ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3
 ; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32>
 ; AVX2-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44
 ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
 ; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -3
 ; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32>
 ; AVX2-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44
 ; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12
 ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3
 ; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32>
 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
-; AVX2-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44
 ; AVX2-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]],
 ; AVX2-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]],
 ; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]],
@@ -2715,22 +2715,22 @@
 ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -7
 ; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> undef, <8 x i32>
 ; AVX512-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <8 x double>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP20]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP20]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56
 ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -7
 ; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> undef, <8 x i32>
 ; AVX512-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <8 x double>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP23]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP23]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56
 ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -16
 ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -7
 ; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> undef, <8 x i32>
 ; AVX512-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <8 x double>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP26]], i32 8, <8 x i1> [[REVERSE31]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP26]], i32 8, <8 x i1> [[REVERSE31]], <8 x double> undef), !alias.scope !56
 ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -24
 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -7
 ; AVX512-NEXT: [[REVERSE34:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> undef, <8 x i32>
 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP29]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56
 ; AVX512-NEXT: [[TMP30:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]],
 ; AVX512-NEXT: [[TMP31:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD29]],
 ; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD32]],
@@ -2910,13 +2910,13 @@
 ; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double*> undef)
 ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 4
 ; AVX-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>*
-; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef)
+; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef)
 ; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
 ; AVX-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>*
-; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef)
+; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef)
 ; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 12
 ; AVX-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>*
-; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef)
+; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef)
 ; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
 ; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
 ; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
@@ -3003,13 +3003,13 @@
 ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double*> undef)
 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef)
 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 16
 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef)
 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 24
 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef)
 ; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
 ; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
 ; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
@@ -3163,13 +3163,13 @@
 ; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x i32 ()*> undef)
 ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 4
 ; AVX-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>*
-; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef)
+; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef)
 ; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
 ; AVX-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>*
-; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef)
+; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef)
 ; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 12
 ; AVX-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>*
-; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef)
+; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef)
 ; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
 ; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
 ; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
@@ -3256,13 +3256,13 @@
 ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x i32 ()*> undef)
 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef)
 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 16
 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef)
 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 24
 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef)
 ; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
 ; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
 ; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
Index: test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll
===================================================================
--- test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll
+++ test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll
@@ -16,7 +16,7 @@
 ; CHECK-LABEL: @foo(
 ; CHECK: %[[sret_cast:[^=]+]] = bitcast [8 x i64]* %sret to i8*
-; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[sret_cast]], i8 0, i64 64
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 %[[sret_cast]], i8 0, i64 64
 ; CHECK-NOT: call void @llvm.memcpy
 ; CHECK: ret void
 }
@@ -38,12 +38,12 @@
 ; CHECK-LABEL: @bar(
 ; CHECK: %[[a:[^=]+]] = alloca [8 x i64]
 ; CHECK: %[[a_cast:[^=]+]] = bitcast [8 x i64]* %[[a]] to i8*
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 %[[a_cast]], i8 0, i64 64
+; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 %[[a_cast]], i8 0, i64 64
 ; CHECK: %[[sret_cast:[^=]+]] = bitcast [8 x i64]* %sret to i8*
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 %[[sret_cast]], i8 0, i64 64
-; CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 %[[a_cast]], i8 42, i64 32
+; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 %[[sret_cast]], i8 0, i64 64
+; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 %[[a_cast]], i8 42, i64 32
 ; CHECK: %[[out_cast:[^=]+]] = bitcast [8 x i64]* %out to i8*
-; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[out_cast]], i8* align 8 %[[a_cast]], i64 64
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 %[[out_cast]], i8* nonnull align 8 %[[a_cast]], i64 64
 ; CHECK-NOT: call void @llvm.memcpy
 ; CHECK: ret void
 }