Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2254,6 +2254,10 @@
   if (Group->isReverse())
     Index += (VF - 1) * Group->getFactor();
 
+  bool InBounds = false;
+  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+    InBounds = gep->isInBounds();
+
   for (unsigned Part = 0; Part < UF; Part++) {
     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
 
@@ -2269,6 +2273,8 @@
     // A[i+2] = c;     // Member of index 2 (Current instruction)
     // Current pointer is pointed to A[i+2], adjust it to A[i].
     NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
+    if (InBounds)
+      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
 
     // Cast to the vector pointer type.
     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
@@ -2405,17 +2411,30 @@
   if (isMaskRequired)
     Mask = *BlockInMask;
 
+  bool InBounds = false;
+  if (auto *gep = dyn_cast<GetElementPtrInst>(
+          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
+    InBounds = gep->isInBounds();
+
   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
     // Calculate the pointer for the specific unroll-part.
-    Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+    GetElementPtrInst *PartPtr = nullptr;
 
    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
-      PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
-      PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
+      PartPtr->setIsInBounds(InBounds);
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
+      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        Mask[Part] = reverseVector(Mask[Part]);
+    } else {
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
+      PartPtr->setIsInBounds(InBounds);
    }
 
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
Index: llvm/trunk/test/Transforms/LoopVectorize/AArch64/pr36032.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/pr36032.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/pr36032.ll
@@ -78,11 +78,11 @@
 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[CONV]], [[TMP19]]
 ; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, i8* [[TMP22]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0
 ; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <4 x i8>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1, !alias.scope !0
 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, i8* [[TMP25]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP25]], i32 0
 ; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <4 x i8>*
 ; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP27]], align 1, !alias.scope !3, !noalias !0
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
Index: llvm/trunk/test/Transforms/LoopVectorize/ARM/sphinx.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/sphinx.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/sphinx.ll
@@ -49,18 +49,18 @@
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]],
 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, float* [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>*
 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
 ; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
 ; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP9]]
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>*
 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP13]], align 4
 ; CHECK-NEXT: [[TMP14:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double>
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll
===================================================================
---
llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -111,13 +111,13 @@ ; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 8 +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 8 ; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP0]], i64 16 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16 ; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP0]], i64 24 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 24 ; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !0 ; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], @@ -127,13 +127,13 @@ ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 8 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 8 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP12]], i64 16 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 16 ; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 24 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 24 ; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !3 ; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] @@ -143,26 +143,26 @@ ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP20]], <8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP24]], i64 8 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* ; AVX2-NEXT: call void 
@llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP21]], <8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP24]], i64 16 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP22]], <8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP24]], i64 24 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP23]], <8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !5, !noalias !7 ; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP33]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP32]], i64 8 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 8 ; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP35]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[TMP32]], i64 16 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 16 ; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP37]], align 4, !alias.scope !0 -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[TMP32]], i64 24 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 24 ; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP39]], align 4, !alias.scope !0 ; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], @@ -172,13 +172,13 @@ ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP44]], i64 8 +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 8 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP44]], i64 16 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16 ; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !3 -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[TMP44]], i64 24 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 24 ; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to 
<8 x i32>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !3 ; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] @@ -188,13 +188,13 @@ ; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <8 x i32>* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP52]], <8 x i32>* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP58:%.*]] = getelementptr i32, i32* [[TMP56]], i64 8 +; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 8 ; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <8 x i32>* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP53]], <8 x i32>* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP60:%.*]] = getelementptr i32, i32* [[TMP56]], i64 16 +; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 16 ; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <8 x i32>* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP54]], <8 x i32>* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !5, !noalias !7 -; AVX2-NEXT: [[TMP62:%.*]] = getelementptr i32, i32* [[TMP56]], i64 24 +; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 24 ; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <8 x i32>* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP55]], <8 x i32>* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !5, !noalias !7 ; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64 @@ -280,13 +280,13 @@ ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP0]], i64 32 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 32 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP0]], i64 48 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 48 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !0 ; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], @@ -296,13 +296,13 @@ ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP14:%.*]] = 
getelementptr inbounds i32, i32* [[TMP12]], i64 16 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP12]], i64 32 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 32 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 48 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 48 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !3 ; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] @@ -312,26 +312,26 @@ ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP20]], <16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP21]], <16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP24]], i64 32 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP22]], <16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP24]], i64 48 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP23]], <16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP33]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP32]], i64 16 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 16 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP35]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[TMP32]], i64 32 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 32 ; AVX512-NEXT: [[TMP37:%.*]] = bitcast 
i32* [[TMP36]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP37]], align 4, !alias.scope !0 -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[TMP32]], i64 48 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 48 ; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP39]], align 4, !alias.scope !0 ; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], @@ -341,13 +341,13 @@ ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP44]], i64 16 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP44]], i64 32 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 32 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !3 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[TMP44]], i64 48 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 48 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !3 ; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] @@ -357,13 +357,13 @@ ; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <16 x i32>* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP52]], <16 x i32>* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32* [[TMP56]], i64 16 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 16 ; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <16 x i32>* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP53]], <16 x i32>* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32* [[TMP56]], i64 32 +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 32 ; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <16 x i32>* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP54]], <16 x i32>* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr i32, i32* [[TMP56]], i64 48 +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 48 ; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <16 x i32>* 
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP55]], <16 x i32>* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128 @@ -584,13 +584,13 @@ ; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP2:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 8 +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 8 ; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 16 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 16 ; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 24 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 24 ; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11 ; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], @@ -600,13 +600,13 @@ ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 8 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 8 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 16 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 16 ; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 24 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 24 ; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !14 ; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> 
[[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] @@ -616,26 +616,26 @@ ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP20]], <8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 8 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP21]], <8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 16 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP22]], <8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 24 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP23]], <8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !16, !noalias !18 ; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 8 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 8 ; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 16 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 16 ; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11 -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 24 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 24 ; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11 ; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], @@ -645,13 +645,13 @@ ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: 
[[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 8 +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 8 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 16 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 16 ; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !14 -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 24 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 24 ; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !14 ; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] @@ -661,13 +661,13 @@ ; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]] ; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP52]], <8 x i32> addrspace(1)* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP58:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 8 +; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 8 ; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP53]], <8 x i32> addrspace(1)* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP60:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 16 +; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 16 ; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP54]], <8 x i32> addrspace(1)* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP62:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 24 +; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 24 ; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP55]], <8 x i32> addrspace(1)* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !16, !noalias !18 ; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64 @@ -753,13 +753,13 @@ ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]] ; AVX512-NEXT: 
[[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 16 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 32 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 32 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 48 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 48 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], @@ -769,13 +769,13 @@ ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 16 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 32 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 32 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 48 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 48 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !14 ; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] @@ -785,26 +785,26 @@ ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: 
call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP20]], <16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP21]], <16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 32 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP22]], <16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 48 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP23]], <16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 16 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 16 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 32 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 32 ; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11 -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 48 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 48 ; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], @@ -814,13 +814,13 @@ ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> 
undef), !alias.scope !14 -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 16 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 16 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 32 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 32 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !14 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 48 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 48 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !14 ; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] @@ -830,13 +830,13 @@ ; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP52]], <16 x i32> addrspace(1)* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 16 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 16 ; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP53]], <16 x i32> addrspace(1)* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 32 +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 32 ; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP54]], <16 x i32> addrspace(1)* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !16, !noalias !18 -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 48 +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 48 ; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP55]], <16 x i32> addrspace(1)* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !16, !noalias !18 ; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128 @@ -994,13 +994,13 @@ ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* ; AVX1-NEXT: 
[[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8 ; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* ; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21 -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16 +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* ; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21 -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 24 +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24 ; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* ; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 ; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], @@ -1010,13 +1010,13 @@ ; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] ; AVX1-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24 -; AVX1-NEXT: [[TMP16:%.*]] = getelementptr float, float* [[TMP14]], i64 8 +; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8 ; AVX1-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24 -; AVX1-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[TMP14]], i64 16 +; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16 ; AVX1-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24 -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[TMP14]], i64 24 +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24 ; AVX1-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24 ; AVX1-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> @@ -1030,13 +1030,13 @@ ; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] ; AVX1-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* ; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 -; AVX1-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP30]], i64 8 +; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 8 ; AVX1-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>* ; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28 -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP30]], i64 16 +; 
AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16 ; AVX1-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>* ; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28 -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP30]], i64 24 +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 24 ; AVX1-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>* ; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 @@ -1100,13 +1100,13 @@ ; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8 ; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16 ; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21 -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 24 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24 ; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* ; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 ; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], @@ -1116,13 +1116,13 @@ ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr float, float* [[TMP14]], i64 8 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8 ; AVX2-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24 -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[TMP14]], i64 16 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16 ; AVX2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[TMP14]], i64 24 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24 ; AVX2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP21]], i32 4, 
<8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24 ; AVX2-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> @@ -1136,13 +1136,13 @@ ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP30]], i64 8 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 8 ; AVX2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>* ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28 -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP30]], i64 16 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16 ; AVX2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>* ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28 -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP30]], i64 24 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 24 ; AVX2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>* ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 @@ -1234,13 +1234,13 @@ ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !21 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !21 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 32 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 32 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !21 -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 48 +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 48 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !21 ; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], @@ -1250,13 +1250,13 @@ ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <16 x float>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> undef), !alias.scope !24 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr float, float* [[TMP14]], i64 16 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16 ; 
AVX512-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <16 x float>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP17]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef), !alias.scope !24 -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[TMP14]], i64 32 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 32 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef), !alias.scope !24 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[TMP14]], i64 48 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 48 ; AVX512-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <16 x float>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP21]], i32 4, <16 x i1> [[TMP13]], <16 x float> undef), !alias.scope !24 ; AVX512-NEXT: [[TMP22:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> @@ -1270,13 +1270,13 @@ ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP26]], <16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP30]], i64 16 +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16 ; AVX512-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !26, !noalias !28 -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP30]], i64 32 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 32 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP28]], <16 x float>* [[TMP35]], i32 4, <16 x i1> [[TMP12]]), !alias.scope !26, !noalias !28 -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP30]], i64 48 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 48 ; AVX512-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP29]], <16 x float>* [[TMP37]], i32 4, <16 x i1> [[TMP13]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 @@ -1439,13 +1439,13 @@ ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31 -; AVX1-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 4 +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4 ; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* ; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31 -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8 ; 
AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31 -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 12 +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 12 ; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31 ; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], @@ -1455,13 +1455,13 @@ ; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]] ; AVX1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34 -; AVX1-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP14]], i64 4 +; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4 ; AVX1-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34 -; AVX1-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP14]], i64 8 +; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8 ; AVX1-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34 -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP14]], i64 12 +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12 ; AVX1-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34 ; AVX1-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> @@ -1475,13 +1475,13 @@ ; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]] ; AVX1-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38 -; AVX1-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP30]], i64 4 +; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 4 ; AVX1-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38 -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP30]], i64 8 +; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8 ; AVX1-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38 -; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP30]], i64 12 +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 12 ; AVX1-NEXT: 
[[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 @@ -1542,13 +1542,13 @@ ; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 4 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4 ; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* ; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8 ; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31 -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 12 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 12 ; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31 ; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], @@ -1558,13 +1558,13 @@ ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP14]], i64 4 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4 ; AVX2-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34 -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP14]], i64 8 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8 ; AVX2-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP14]], i64 12 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12 ; AVX2-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34 ; AVX2-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> @@ -1578,13 +1578,13 @@ ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]] ; AVX2-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), 
!alias.scope !36, !noalias !38 -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP30]], i64 4 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 4 ; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38 -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP30]], i64 8 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8 ; AVX2-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38 -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP30]], i64 12 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 12 ; AVX2-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 @@ -1673,13 +1673,13 @@ ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !31 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !31 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !31 -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 24 +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !31 ; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], @@ -1689,13 +1689,13 @@ ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP15]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !34 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP14]], i64 8 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP17]], i32 8, <8 x i1> [[TMP11]], <8 x double> undef), !alias.scope !34 -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP14]], i64 16 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 16 ; 
AVX512-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP19]], i32 8, <8 x i1> [[TMP12]], <8 x double> undef), !alias.scope !34 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP14]], i64 24 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 24 ; AVX512-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> undef), !alias.scope !34 ; AVX512-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> @@ -1709,13 +1709,13 @@ ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP26]], <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP30]], i64 8 +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8 ; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP27]], <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP11]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP30]], i64 16 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 16 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP28]], <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !36, !noalias !38 -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP30]], i64 24 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 24 ; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP29]], <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !36, !noalias !38 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 @@ -2426,22 +2426,22 @@ ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; AVX1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] -; AVX1-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -3 +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3 ; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 ; AVX1-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -4 -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 -3 +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -4 +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; AVX1-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41 ; AVX1-NEXT: 
[[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 -; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -3 +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8 +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -3 ; AVX1-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* ; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 ; AVX1-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -12 -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -3 +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -12 +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -3 ; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* ; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 ; AVX1-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> @@ -2450,22 +2450,22 @@ ; AVX1-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer ; AVX1-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer ; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] -; AVX1-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -3 +; AVX1-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3 ; AVX1-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> ; AVX1-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 -; AVX1-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -4 -; AVX1-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -3 +; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4 +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3 ; AVX1-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> ; AVX1-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 -; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -3 +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8 +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -3 ; AVX1-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> ; AVX1-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -12 -; AVX1-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -3 +; AVX1-NEXT: 
[[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12 +; AVX1-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3 ; AVX1-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> ; AVX1-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 @@ -2474,19 +2474,19 @@ ; AVX1-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], ; AVX1-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], ; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] -; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -3 +; AVX1-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -3 ; AVX1-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 -; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -4 -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -3 +; AVX1-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -4 +; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -3 ; AVX1-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 -; AVX1-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 -; AVX1-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP40]], i64 -3 +; AVX1-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8 +; AVX1-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -3 ; AVX1-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 -; AVX1-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -12 -; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -3 +; AVX1-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -12 +; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -3 ; AVX1-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 @@ -2544,22 +2544,22 @@ ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] ; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -3 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3 ; AVX2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 ; AVX2-NEXT: [[REVERSE:%.*]] 
= shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -4 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 -3 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -4 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3 ; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; AVX2-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41 ; AVX2-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -3 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -3 ; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* ; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 ; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -12 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -3 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -12 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -3 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* ; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 ; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> @@ -2568,22 +2568,22 @@ ; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer ; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] -; AVX2-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -3 +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3 ; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> ; AVX2-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -4 -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -3 +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4 +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3 ; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> ; AVX2-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -3 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8 +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* 
[[TMP24]], i64 -3 ; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> ; AVX2-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -12 -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -3 +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3 ; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> ; AVX2-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 @@ -2592,19 +2592,19 @@ ; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], ; AVX2-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], ; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] -; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -3 +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -3 ; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -4 -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -3 +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -4 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -3 ; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 -; AVX2-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP40]], i64 -3 +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8 +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -3 ; AVX2-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -12 -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -3 +; AVX2-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -12 +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -3 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 @@ -2688,22 +2688,22 @@ ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; AVX512-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -7 +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -7 ; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP4]], align 4, !alias.scope !53 ; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> undef, <8 x i32> -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 -7 +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !53 ; AVX512-NEXT: [[REVERSE21:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD20]], <8 x i32> undef, <8 x i32> -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -16 -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -7 +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -16 +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -7 ; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !53 ; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD22]], <8 x i32> undef, <8 x i32> -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -24 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -7 +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -24 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -7 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !53 ; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD24]], <8 x i32> undef, <8 x i32> @@ -2712,22 +2712,22 @@ ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE23]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <8 x i32> [[REVERSE25]], zeroinitializer ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -7 +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -7 ; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP20]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56 -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -7 +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8 +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -7 ; AVX512-NEXT: [[REVERSE28:%.*]] = 
shufflevector <8 x i1> [[TMP15]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP23]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56 -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -16 -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -7 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -16 +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -7 ; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP26]], i32 8, <8 x i1> [[REVERSE31]], <8 x double> undef), !alias.scope !56 -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -24 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -7 +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -24 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -7 ; AVX512-NEXT: [[REVERSE34:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56 @@ -2736,19 +2736,19 @@ ; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD32]], ; AVX512-NEXT: [[TMP33:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD35]], ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -7 +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -7 ; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP30]], <8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !58, !noalias !60 -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -7 +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -7 ; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP31]], <8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE28]]), !alias.scope !58, !noalias !60 -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -16 -; AVX512-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP40]], i64 -7 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -16 +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -7 ; AVX512-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP32]], <8 x double>* [[TMP42]], i32 8, <8 x i1> [[REVERSE31]]), !alias.scope !58, 
!noalias !60 -; AVX512-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -24 -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -7 +; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -24 +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -7 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP33]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[REVERSE34]]), !alias.scope !58, !noalias !60 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 @@ -2892,13 +2892,13 @@ ; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4 +; AVX-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4 ; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8 ; AVX-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 12 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 ; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 ; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], @@ -2912,13 +2912,13 @@ ; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] ; AVX-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>* ; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x double*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr double*, double** [[TMP16]], i64 4 +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 4 ; AVX-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>* ; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x double*> undef) -; AVX-NEXT: [[TMP20:%.*]] = getelementptr double*, double** [[TMP16]], i64 8 +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8 ; AVX-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <4 x double*>* ; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x double*> undef) -; AVX-NEXT: [[TMP22:%.*]] = getelementptr double*, double** [[TMP16]], i64 12 +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 12 ; AVX-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <4 x double*>* ; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double*> undef) ; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer @@ -2932,13 +2932,13 @@ ; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] 
; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 4 +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4 ; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) -; AVX-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 8 +; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 ; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) -; AVX-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 12 +; AVX-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12 ; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 @@ -2989,13 +2989,13 @@ ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 24 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 ; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], @@ -3009,13 +3009,13 @@ ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x double*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double*, double** [[TMP16]], i64 8 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x double*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double*, double** [[TMP16]], i64 16 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 16 ; AVX512-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <8 x double*>* ; AVX512-NEXT: 
[[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x double*> undef) -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double*, double** [[TMP16]], i64 24 +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 24 ; AVX512-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <8 x double*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double*> undef) ; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer @@ -3029,13 +3029,13 @@ ; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] ; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 8 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 16 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16 ; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 24 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24 ; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 @@ -3153,13 +3153,13 @@ ; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] ; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 -; AVX-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4 +; AVX-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4 ; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 -; AVX-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8 ; AVX-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 12 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 ; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 ; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], @@ -3173,13 +3173,13 @@ ; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] ; AVX-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>* ; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x 
i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 4 +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 4 ; AVX-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>* ; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP20:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 8 +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8 ; AVX-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <4 x i32 ()*>* ; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP22:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 12 +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 12 ; AVX-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <4 x i32 ()*>* ; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x i32 ()*> undef) ; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer @@ -3193,13 +3193,13 @@ ; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] ; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 4 +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4 ; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) -; AVX-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 8 +; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 ; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) -; AVX-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 12 +; AVX-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12 ; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 @@ -3250,13 +3250,13 @@ ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x 
i8>* ; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1 -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 24 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 ; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], @@ -3270,13 +3270,13 @@ ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 8 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 16 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 16 ; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <8 x i32 ()*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 24 +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 24 ; AVX512-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <8 x i32 ()*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x i32 ()*> undef) ; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer @@ -3290,13 +3290,13 @@ ; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] ; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 8 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 16 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16 ; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 24 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24 ; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, 
<8 x i1> [[TMP32]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 Index: llvm/trunk/test/Transforms/LoopVectorize/X86/metadata-enable.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -2076,12 +2076,12 @@ ; O1VEC2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; O1VEC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; O1VEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] -; O1VEC2-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0 +; O1VEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 ; O1VEC2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; O1VEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] ; O1VEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0 +; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 ; O1VEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4 ; O1VEC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 @@ -2121,12 +2121,12 @@ ; OzVEC2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; OzVEC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; OzVEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] -; OzVEC2-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0 +; OzVEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 ; OzVEC2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; OzVEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] ; OzVEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] -; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0 +; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 ; OzVEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4 ; OzVEC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 Index: llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -49,7 +49,7 @@ ; CHECK: %offset.idx = sub i64 %n, %index ; CHECK-NOT: getelementptr ; CHECK: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 -3 -; CHECK: getelementptr i32, i32* %[[G0]], i64 %offset.idx +; CHECK: getelementptr inbounds i32, i32* %[[G0]], i64 %offset.idx ; CHECK-NOT: getelementptr ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body ; @@ -162,7 +162,7 @@ ; INTER: %offset.idx = sub i64 %n, %index ; INTER-NOT: getelementptr ; INTER: %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %offset.idx, i32 0 -; INTER: getelementptr i32, i32* %[[G0]], i64 -6 +; INTER: getelementptr inbounds i32, i32* %[[G0]], i64 -6 ; INTER-NOT: getelementptr ; INTER: br i1 {{.*}}, label %middle.block, label %vector.body ; Index: llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll 
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll
@@ -54,7 +54,7 @@
; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP10]], align 4
-; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i64 4
+; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP9]], i64 4
; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP12]], align 4
; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
Index: llvm/trunk/test/Transforms/LoopVectorize/induction-step.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction-step.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/induction-step.ll
@@ -30,7 +30,7 @@
; CHECK-NEXT: %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
; CHECK: [[TMP8:%.*]] = add i64 %index, 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
; CHECK-NEXT: store <8 x i32> %vec.ind, <8 x i32>* [[TMP11]], align 4
; CHECK: %index.next = add i64 %index, 8
@@ -101,7 +101,7 @@
; CHECK-NEXT: %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
; CHECK: [[TMP6:%.*]] = add i64 %index, 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
; CHECK-NEXT: store <8 x i32> %vec.ind, <8 x i32>* [[TMP9]], align 4
; CHECK: %index.next = add i64 %index, 8
@@ -174,7 +174,7 @@
; CHECK: [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
; CHECK: [[TMP6:%.*]] = add i64 %index, 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
; CHECK-NEXT: store <8 x i32> [[VEC_IND10]], <8 x i32>* [[TMP9]], align 4
; CHECK-NEXT: %index.next = add i64 %index, 8
Index: llvm/trunk/test/Transforms/LoopVectorize/induction.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/induction.ll
@@ -14,7 +14,7 @@
; CHECK-NEXT: %vec.ind = phi <2 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ]
; CHECK: [[TMP3:%.*]] = add i64 %index, 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* %A, i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
; CHECK-NEXT: store <2 x i32> %vec.ind, <2 x i32>* [[TMP6]], align 4
; CHECK: %index.next = add i64 %index, 2
@@ -107,7 +107,7 @@
; UNROLL: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; UNROLL-NOT: add i64 {{.*}}, 4
; UNROLL: %[[g1:.+]] = getelementptr inbounds i64, i64* %a, i64 %index
-; UNROLL: getelementptr i64, i64* %[[g1]], i64 2
+; UNROLL: getelementptr inbounds i64, i64* %[[g1]], i64 2
define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
entry:
@@ -353,7 +353,7 @@
; UNROLL: %[[I2:.+]] = or i32 %index, 2
; UNROLL: %[[E0:.+]] = sext i32 %index to i64
; UNROLL: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[E0]]
-; UNROLL: getelementptr i32, i32* %[[G0]], i64 2
+; UNROLL: getelementptr inbounds i32, i32* %[[G0]], i64 2
; UNROLL: pred.udiv.if:
; UNROLL: udiv i32 {{.*}}, %index
; UNROLL: pred.udiv.if{{[0-9]+}}:
@@ -711,7 +711,7 @@
; CHECK: %offset.idx = add i32 %i, %index
; CHECK: %[[A1:.*]] = add i32 %offset.idx, 0
; CHECK: %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A1]]
-; CHECK: %[[G3:.*]] = getelementptr i32, i32* %[[G1]], i32 0
+; CHECK: %[[G3:.*]] = getelementptr inbounds i32, i32* %[[G1]], i32 0
; CHECK: %[[B1:.*]] = bitcast i32* %[[G3]] to <2 x i32>*
; CHECK: store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
; CHECK: %index.next = add i32 %index, 2
@@ -751,7 +751,7 @@
; UNROLL: %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i64 %[[S1]]
; UNROLL: %[[B1:.*]] = bitcast i32* %[[G1]] to <2 x i32>*
; UNROLL: store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
-; UNROLL: %[[G2:.*]] = getelementptr i32, i32* %[[G1]], i64 2
+; UNROLL: %[[G2:.*]] = getelementptr inbounds i32, i32* %[[G1]], i64 2
; UNROLL: %[[B2:.*]] = bitcast i32* %[[G2]] to <2 x i32>*
; UNROLL: store <2 x i32> %step.add, <2 x i32>* %[[B2]]
; UNROLL: %index.next = add i32 %index, 4
@@ -780,7 +780,7 @@
; CHECK: [[VEC_IND:%.*]] = phi <2 x i32> [ , %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
; CHECK: [[TMP3:%.*]] = add i64 %index, 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* %a, i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
; CHECK-NEXT: %index.next = add i64 %index, 2
Index: llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -245,7 +245,7 @@
; CHECK-LABEL: @test_reversed_load2_store2(
; CHECK: %[[G0:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %offset.idx, i32 0
-; CHECK: %[[G1:.+]] = getelementptr i32, i32* %[[G0]], i64 -6
+; CHECK: %[[G1:.+]] = getelementptr inbounds i32, i32* %[[G0]], i64 -6
; CHECK: %[[B0:.+]] = bitcast i32* %[[G1]] to <8 x i32>*
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32>
@@ -255,7 +255,7 @@
; CHECK: add nsw <4 x i32>
; CHECK: sub nsw <4 x i32>
; CHECK: %[[G2:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %offset.idx, i32 1
-; CHECK: %[[G3:.+]] = getelementptr i32, i32* %[[G2]], i64 -7
+; CHECK: %[[G3:.+]] = getelementptr inbounds i32, i32* %[[G2]], i64 -7
; CHECK: %[[B1:.+]] = bitcast i32* %[[G3]] to <8 x i32>*
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32>
Index: llvm/trunk/test/Transforms/LoopVectorize/pr23997.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/pr23997.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/pr23997.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Ensure that the 'inbounds' is preserved on the GEPs that feed the load and store in the loop.
+define void @foo(i8 addrspace(1)* align 8 dereferenceable_or_null(16), i8 addrspace(1)* align 8 dereferenceable_or_null(8), i64) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[PREHEADER:%.*]]
+; CHECK: preheader:
+; CHECK-NEXT: [[DOT10:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP0:%.*]], i64 16
+; CHECK-NEXT: [[DOT11:%.*]] = bitcast i8 addrspace(1)* [[DOT10]] to i8 addrspace(1)* addrspace(1)*
+; CHECK-NEXT: [[DOT12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP1:%.*]], i64 16
+; CHECK-NEXT: [[DOT13:%.*]] = bitcast i8 addrspace(1)* [[DOT12]] to i8 addrspace(1)* addrspace(1)*
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2:%.*]], 1
+; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP2]], 1
+; CHECK-NEXT: [[UMAX1:%.*]] = select i1 [[TMP4]], i64 [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[UMAX1]], 3
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 [[TMP6]]
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP1]], i64 [[TMP6]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[DOT10]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[DOT12]], [[SCEVGEP]]
+; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[UMAX]], -16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT13]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP7]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP8]], align 8, !alias.scope !0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 4
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP9]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP10]], align 8, !alias.scope !0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 8
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP11]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP12]], align 8, !alias.scope !0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 12
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP13]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP14]], align 8, !alias.scope !0
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT11]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP15]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP16]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 4
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP17]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD6]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP18]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 8
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP19]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD7]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP20]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 12
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP21]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD8]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP22]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[DOT18:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT13]], i64 [[INDVARS_IV3]]
+; CHECK-NEXT: [[V:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT18]], align 8
+; CHECK-NEXT: [[DOT20:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT11]], i64 [[INDVARS_IV3]]
+; CHECK-NEXT: store i8 addrspace(1)* [[V]], i8 addrspace(1)* addrspace(1)* [[DOT20]], align 8
+; CHECK-NEXT: [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1
+; CHECK-NEXT: [[DOT21:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT4]], [[TMP2]]
+; CHECK-NEXT: br i1 [[DOT21]], label [[LOOP]], label [[LOOPEXIT]], !llvm.loop !7
+; CHECK: loopexit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %preheader
+
+preheader: ; preds = %4
+ %.10 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 16
+ %.11 = bitcast i8 addrspace(1)* %.10 to i8 addrspace(1)* addrspace(1)*
+ %.12 = getelementptr inbounds i8, i8 addrspace(1)* %1, i64 16
+ %.13 = bitcast i8 addrspace(1)* %.12 to i8 addrspace(1)* addrspace(1)*
+ br label %loop
+
+loop:
+ %indvars.iv3 = phi i64 [ 0, %preheader ], [ %indvars.iv.next4, %loop ]
+ %.18 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.13, i64 %indvars.iv3
+ %v = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.18, align 8
+ %.20 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.11, i64 %indvars.iv3
+ store i8 addrspace(1)* %v, i8 addrspace(1)* addrspace(1)* %.20, align 8
+ %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
+ %.21 = icmp ult i64 %indvars.iv.next4, %2
+ br i1 %.21, label %loop, label %loopexit
+
+loopexit:
+ ret void
+}
+
+attributes #0 = { uwtable "target-cpu"="skylake" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,+xsavec,+popcnt,+aes,-avx512bitalg,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-ibt,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,-avx512vl,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
+
+!0 = !{i32 0, i32 2147483646}
+!1 = !{}
Index: llvm/trunk/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
@@ -13,7 +13,7 @@
; CHECK: %[[T4:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T3]]
; CHECK: %[[T5:.+]] = bitcast i32* %[[T4]] to <4 x i32>*
; CHECK: load <4 x i32>, <4 x i32>* %[[T5]], align 4
-; CHECK: %[[T6:.+]] = getelementptr i32, i32* %[[T4]], i64 4
+; CHECK: %[[T6:.+]] = getelementptr inbounds i32, i32* %[[T4]], i64 4
; CHECK: %[[T7:.+]] = bitcast i32* %[[T6]] to <4 x i32>*
; CHECK: load <4 x i32>, <4 x i32>* %[[T7]], align 4
; CHECK: br {{.*}}, label %middle.block, label %vector.body
@@ -31,10 +31,10 @@
; NO-IC: %[[T7:.+]] = sub nsw i64 %[[T5]], %x
; NO-IC: %[[T8:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T6]]
; NO-IC: %[[T9:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T7]]
-; NO-IC: %[[T10:.+]] = getelementptr i32, i32* %[[T8]], i32 0
+; NO-IC: %[[T10:.+]] = getelementptr inbounds i32, i32* %[[T8]], i32 0
; NO-IC: %[[T11:.+]] = bitcast i32* %[[T10]] to <4 x i32>*
; NO-IC: load <4 x i32>, <4 x i32>* %[[T11]], align 4
-; NO-IC: %[[T12:.+]] = getelementptr i32, i32* %[[T8]], i32 4
+; NO-IC: %[[T12:.+]] = getelementptr inbounds i32, i32* %[[T8]], i32 4
; NO-IC: %[[T13:.+]] = bitcast i32* %[[T12]] to <4 x i32>*
; NO-IC: load <4 x i32>, <4 x i32>* %[[T13]], align 4
; NO-IC: br {{.*}}, label %middle.block, label %vector.body