diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -511,9 +511,9 @@
   /// A helper function to scalarize a single Instruction in the innermost loop.
   /// Generates a sequence of scalar instances for each lane between \p MinLane
   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
-  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
+  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
   /// Instr's operands.
-  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
+  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                             const VPIteration &Instance, bool IfPredicateInstr,
                             VPTransformState &State);
@@ -3022,8 +3022,49 @@
   }
 }

-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
-                                               VPUser &User,
+// Returns true if `Def` is part of the address computation of a recipe
+// generating a widen memory instruction. Returns false otherwise.
+static bool isAddressComputationOfWidenMemInstr(VPValue *Def) {
+  SmallVector<VPValue *, 4> Worklist;
+  SmallPtrSet<VPValue *, 4> Visited;
+  Worklist.push_back(Def);
+
+  // Traverse def-use chain from Def searching for a
+  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
+  while (!Worklist.empty()) {
+    VPValue *CurDef = Worklist.back();
+    Worklist.pop_back();
+
+    if (!Visited.insert(CurDef).second)
+      continue;
+
+    // Address computation should only happen on integer types.
+    // TODO: Retrieve the type from the VPRecipes/VPInstructions when available.
+    Value *UnderlyingVal = CurDef->getUnderlyingValue();
+    if (!UnderlyingVal || (!UnderlyingVal->getType()->isIntOrIntVectorTy() &&
+                           !UnderlyingVal->getType()->isPtrOrPtrVectorTy()))
+      return false;
+
+    for (VPUser *User : CurDef->users()) {
+      if (auto *RecBase = dyn_cast<VPRecipeBase>(User)) {
+        auto *WidenMemRec = dyn_cast<VPWidenMemoryInstructionRecipe>(RecBase);
+        auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(RecBase);
+        // Make sure the definition is not the value to store.
+        if ((WidenMemRec && WidenMemRec->getAddr() == CurDef) ||
+            (InterleaveRec && InterleaveRec->getAddr() == CurDef))
+          return true;
+
+        for (auto *SingleDef : RecBase->definedValues())
+          Worklist.push_back(SingleDef);
+      }
+    }
+  }
+
+  return false;
+}
+
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+                                               VPReplicateRecipe *RepRecipe,
                                                const VPIteration &Instance,
                                                bool IfPredicateInstr,
                                                VPTransformState &State) {
@@ -3044,17 +3085,29 @@
   if (!IsVoidRetTy)
     Cloned->setName(Instr->getName() + ".cloned");

+  // If the scalarized instruction contributes to the address computation of a
+  // widen masked load/store, was in a basic block that needed predication
+  // and is not predicated after vectorization, we can't propagate
+  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The control flow
+  // has been linearized and the instruction is no longer guarded by the
+  // predicate, which could lead to the generation of a poison value feeding the
+  // base address of the widen masked load/store.
+  if (!RepRecipe->isPredicated() && !State.Instance && State.VF.isVector() &&
+      Legal->blockNeedsPredication(Instr->getParent()) &&
+      isAddressComputationOfWidenMemInstr(RepRecipe))
+    Cloned->dropPoisonGeneratingFlags();
+
   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
                                Builder.GetInsertPoint());
   // Replace the operands of the cloned instructions with their scalar
   // equivalents in the new loop.
-  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
+  for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) {
     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
     auto InputInstance = Instance;
     if (!Operand || !OrigLoop->contains(Operand) ||
         (Cost->isUniformAfterVectorization(Operand, State.VF)))
       InputInstance.Lane = VPLane::getFirstLane();
-    auto *NewOp = State.get(User.getOperand(op), InputInstance);
+    auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance);
     Cloned->setOperand(op, NewOp);
   }
   addNewMetadata(Cloned, Instr);
@@ -3062,7 +3115,7 @@
   // Place the cloned scalar in the new loop.
   Builder.Insert(Cloned);

-  State.set(Def, Cloned, Instance);
+  State.set(RepRecipe, Cloned, Instance);

   // If we just cloned a new assumption, add it the assumption cache.
   if (auto *II = dyn_cast<AssumeInst>(Cloned))
@@ -9798,8 +9851,8 @@
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
-                                    *State.Instance, IsPredicated, State);
+    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
+                                    IsPredicated, State);
     // Insert scalar instance packing it into a vector.
     if (AlsoPack && State.VF.isVector()) {
       // If we're constructing lane 0, initialize to start from poison.
@@ -9822,7 +9875,7 @@
          "Can't scalarize a scalable vector");
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
-      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
+      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                       VPIteration(Part, Lane), IsPredicated,
                                       State);
 }
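Note: the hazard the new guard addresses, as a minimal LLVM IR sketch (the names are illustrative and hypothetical, not taken verbatim from the tests below, though the pattern mirrors drop-poison-generating-flags.ll). In the scalar loop the flagged instructions only execute under the predicate, so the flags hold:

  ; for (i = 0; i < n; i++) if (i != 0) use input[i - 1]
  if.then:                                                  ; reached only when %iv != 0
    %sub = sub nuw nsw i64 %iv, 1                           ; "0 - 1" can never happen here
    %gep = getelementptr inbounds float, float* %input, i64 %sub
    %v = load float, float* %gep, align 4

After vectorization the control flow is linearized, so %sub also executes for the lane where %iv == 0; with nuw kept, "sub nuw i64 0, 1" wraps and yields poison. That poison would feed the base address of the widen masked load even though the offending lane is masked off, so the flags must be dropped:

  vector.body:
    %sub = sub i64 %iv, 1                                   ; nuw/nsw dropped
    %gep = getelementptr float, float* %input, i64 %sub     ; inbounds dropped
    %ptr = bitcast float* %gep to <4 x float>*
    %v = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %mask, <4 x float> poison)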
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
@@ -5,7 +5,7 @@
 ; CHECK: vector.body:
 ; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
 ; CHECK-NEXT: %[[MASK:.*]] = fcmp ogt <vscale x 4 x float> %[[LOAD1]],
-; CHECK-NEXT: %[[GEPA:.*]] = getelementptr inbounds float, float* %a,
+; CHECK-NEXT: %[[GEPA:.*]] = getelementptr float, float* %a,
 ; CHECK-NEXT: %[[MLOAD_PTRS:.*]] = bitcast float* %[[GEPA]] to <vscale x 4 x float>*
 ; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* %[[MLOAD_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]]
 ; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x float> %[[LOAD1]], %[[LOAD2]]
@@ -42,7 +42,7 @@
 ; CHECK: vector.body:
 ; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
 ; CHECK-NEXT: %[[MASK:.*]] = icmp ne <vscale x 4 x i32> %[[LOAD1]],
-; CHECK-NEXT: %[[GEPA:.*]] = getelementptr inbounds i32, i32* %a,
+; CHECK-NEXT: %[[GEPA:.*]] = getelementptr i32, i32* %a,
 ; CHECK-NEXT: %[[MLOAD_PTRS:.*]] = bitcast i32* %[[GEPA]] to <vscale x 4 x i32>*
 ; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %[[MLOAD_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]]
 ; CHECK-NEXT: %[[FADD:.*]] = add <vscale x 4 x i32> %[[LOAD1]], %[[LOAD2]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -1,4 +1,4 @@
-; This is the loop in c++ being vectorize in this file with 
+; This is the loop in c++ being vectorize in this file with
 ; experimental.vector.reverse
 ;#pragma clang loop vectorize_width(4, scalable)
@@ -18,7 +18,7 @@ define void @vector_reverse_mask_nxv4i1(double* %a, double* %cond, i64 %N) #0 {
 ; CHECK-LABEL: vector.body:
 ; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
-; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* nonnull %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
+; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
 ; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[WIDEMSKLOAD]]
 ; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
 ; CHECK: call void @llvm.masked.store.nxv4f64.p0nxv4f64(<vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -51,16 +51,16 @@
 ; CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x double> [[WIDE_LOAD6]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT: [[TMP8:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer
 ; CHECK-NEXT: [[TMP9:%.*]] = fcmp une <4 x double> [[REVERSE7]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[TMP10]], i64 -3
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, double* [[TMP10]], i64 -3
 ; CHECK-NEXT: [[REVERSE8:%.*]] = shufflevector <4 x i1> [[TMP8]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP11]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP12]], i32 8, <4 x i1> [[REVERSE8]], <4 x double> poison), !alias.scope !3, !noalias !0
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, double* [[TMP10]], i64 -4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP13]], i64 -3
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP12]], i32 8, <4 x i1> [[REVERSE8]], <4 x double> poison), !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP10]], i64 -4
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, double* [[TMP13]], i64 -3
 ; CHECK-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i1> [[TMP9]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP15]], i32 8, <4 x i1> [[REVERSE10]], <4 x double> poison), !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[REVERSE10]], <4 x double> poison), !alias.scope !3, !noalias !0
 ; CHECK-NEXT: [[TMP16:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]],
 ; CHECK-NEXT: [[TMP17:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]],
 ; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP11]] to <4 x double>*
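A side effect visible in the two reverse-mask tests above is that the nonnull attribute on the masked-load pointer argument disappears together with inbounds. A hedged explanation (my reading, not stated in the patch itself): the non-null deduction relied on the address being an inbounds GEP of a non-null base; a plain GEP may wrap to null, so the attribute can no longer be inferred. An illustrative contrast, with hypothetical names:

  %p1 = getelementptr inbounds double, double* %a, i64 %off  ; provable non-null when %a is non-null
  %p2 = getelementptr double, double* %a, i64 %off           ; may wrap to null, so no nonnull deduction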
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -26,10 +26,10 @@
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP5]]
 ; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0
 entry:
@@ -151,10 +151,10 @@
 ; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[VEC_IND]],
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], zeroinitializer
 ; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP4]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = sdiv exact i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP8:%.*]] = sdiv i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i32 0
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP12]], i32 4, <4 x i1> [[TMP10]], <4 x float> poison), !invariant.load !0
 entry:
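The second hunk above shows that exact is handled like nuw/nsw and inbounds: it is also a poison-generating flag, asserting that the division leaves no remainder. A standalone two-line illustration, not taken from the test:

  %q1 = sdiv exact i64 %n, 2  ; poison whenever %n is odd
  %q2 = sdiv i64 %n, 2        ; defined for any %n, rounds toward zero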
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -30,14 +30,14 @@
 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
 ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
 ; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
-; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
+; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
 ; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>*
 ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 4, <16 x i1> [[TMP2]], <16 x i32> poison)
 ; AVX512-NEXT: [[TMP5:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
 ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP5]]
 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP6]], i32 4, <16 x i1> [[TMP2]], <16 x float> undef)
 ; AVX512-NEXT: [[TMP7:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]],
-; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
+; AVX512-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX6]]
 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <16 x float>*
 ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP7]], <16 x float>* [[TMP9]], i32 4, <16 x i1> [[TMP2]])
 ; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 16
@@ -45,14 +45,14 @@
 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
 ; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4
 ; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_1]], zeroinitializer
-; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
 ; AVX512-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP14]], i32 4, <16 x i1> [[TMP12]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP14]], i32 4, <16 x i1> [[TMP12]], <16 x i32> poison)
 ; AVX512-NEXT: [[TMP15:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_1]] to <16 x i64>
 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP15]]
 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP16]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef)
 ; AVX512-NEXT: [[TMP17:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_1]],
-; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[OUT]], i64 [[INDEX_NEXT]]
 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
 ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP17]], <16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]])
 ; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 32
@@ -60,14 +60,14 @@
 ; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>*
 ; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP21]], align 4
 ; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_2]], zeroinitializer
-; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
+; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
 ; AVX512-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP24]], i32 4, <16 x i1> [[TMP22]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP24]], i32 4, <16 x i1> [[TMP22]], <16 x i32> poison)
 ; AVX512-NEXT: [[TMP25:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_2]] to <16 x i64>
 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP25]]
 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP26]], i32 4, <16 x i1> [[TMP22]], <16 x float> undef)
 ; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_2]],
-; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
+; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
 ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP22]])
 ; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 48
@@ -75,14 +75,14 @@
 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
 ; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP31]], align 4
 ; AVX512-NEXT: [[TMP32:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_3]], zeroinitializer
-; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
+; AVX512-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
 ; AVX512-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP34]], i32 4, <16 x i1> [[TMP32]], <16 x i32> poison)
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP34]], i32 4, <16 x i1> [[TMP32]], <16 x i32> poison)
 ; AVX512-NEXT: [[TMP35:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_3]] to <16 x i64>
 ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP35]]
 ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP36]], i32 4, <16 x i1> [[TMP32]], <16 x float> undef)
 ; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_3]],
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
+; AVX512-NEXT: [[TMP38:%.*]] = getelementptr float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
 ; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>*
 ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP39]], i32 4, <16 x i1> [[TMP32]])
 ; AVX512-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 64
@@ -112,18 +112,18 @@
 ; FVW2-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD7]], zeroinitializer
 ; FVW2-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD8]], zeroinitializer
 ; FVW2-NEXT: [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD9]], zeroinitializer
-; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
+; FVW2-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
 ; FVW2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
 ; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP13]], i32 4, <2 x i1> [[TMP8]], <2 x i32> poison)
-; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 2
+; FVW2-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 2
 ; FVW2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; FVW2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP15]], i32 4, <2 x i1> [[TMP9]], <2 x i32> poison)
-; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4
+; FVW2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP15]], i32 4, <2 x i1> [[TMP9]], <2 x i32> poison)
+; FVW2-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP12]], i64 4
 ; FVW2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; FVW2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP17]], i32 4, <2 x i1> [[TMP10]], <2 x i32> poison)
-; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 6
+; FVW2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP17]], i32 4, <2 x i1> [[TMP10]], <2 x i32> poison)
+; FVW2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 6
 ; FVW2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
-; FVW2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP19]], i32 4, <2 x i1> [[TMP11]], <2 x i32> poison)
+; FVW2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP19]], i32 4, <2 x i1> [[TMP11]], <2 x i32> poison)
 ; FVW2-NEXT: [[TMP20:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
 ; FVW2-NEXT: [[TMP21:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD10]] to <2 x i64>
 ; FVW2-NEXT: [[TMP22:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD11]] to <2 x i64>
@@ -140,16 +140,16 @@
 ; FVW2-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]],
 ; FVW2-NEXT: [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]],
 ; FVW2-NEXT: [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]],
-; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
+; FVW2-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[INDEX6]]
 ; FVW2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>*
 ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]])
-; FVW2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 2
+; FVW2-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP32]], i64 2
 ; FVW2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <2 x float>*
 ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP29]], <2 x float>* [[TMP35]], i32 4, <2 x i1> [[TMP9]])
-; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 4
+; FVW2-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP32]], i64 4
 ; FVW2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <2 x float>*
 ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP30]], <2 x float>* [[TMP37]], i32 4, <2 x i1> [[TMP10]])
-; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 6
+; FVW2-NEXT: [[TMP38:%.*]] = getelementptr float, float* [[TMP32]], i64 6
 ; FVW2-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
 ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]])
 ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX6]], 8
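Note the contrast inside the gather_scatter output above: the scalar GEPs feeding the consecutive masked loads and stores lose inbounds, while the widened vector GEP feeding the gather keeps it. That is expected from the code change: flags are only dropped in scalarizeInstruction, i.e. on replicated scalar instructions, and widened recipes are left untouched. From the AVX512 checks:

  ; scalarized address computation, flags dropped:
  [[TMP3:%.*]] = getelementptr i32, i32* [[INDEX]], i64 [[INDEX6]]
  ; widened GEP feeding the gather, flags kept:
  [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP5]]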
diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -261,7 +261,7 @@
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
 ; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT19]], <16 x i32>* [[TMP3]], align 4, !alias.scope !17, !noalias !20
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[C]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP5]], i32 8, <16 x i1> [[TMP2]], <16 x i32> poison), !alias.scope !23
 ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT21]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !24, !noalias !23
@@ -294,7 +294,7 @@
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD28]], [[BROADCAST_SPLAT30]]
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>*
 ; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT32]], <8 x i32>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX25]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[C]], i64 [[INDEX25]]
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD33:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP12]], i32 8, <8 x i1> [[TMP9]], <8 x i32> poison)
 ; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[WIDE_MASKED_LOAD33]], <8 x i32*> [[BROADCAST_SPLAT35]], i32 4, <8 x i1> [[TMP9]])
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -61,20 +61,20 @@
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT8]]
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp slt <4 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT10]]
 ; CHECK-NEXT: [[TMP19:%.*]] = icmp slt <4 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT12]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0
 ; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 4
 ; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 8
 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 12
 ; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP31]], align 4
 ; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[TMP16]], <i1 true, i1 true, i1 true, i1 true>
@@ -228,20 +228,20 @@
 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4
 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
@@ -918,20 +918,20 @@
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2
 ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 0
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[TMP65]], i32 0
 ; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP70]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 4
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, i32* [[TMP65]], i32 4
 ; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP72]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 8
+; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, i32* [[TMP65]], i32 8
 ; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP74]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 12
+; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, i32* [[TMP65]], i32 12
 ; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP76]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
@@ -1091,20 +1091,20 @@
 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
@@ -1609,20 +1609,20 @@
 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
@@ -1776,20 +1776,20 @@
 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
@@ -1943,20 +1943,20 @@
 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
@@ -2119,20 +2119,20 @@
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 1
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 2
 ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 [[TMP60]], i32 3
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 0
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[TMP65]], i32 0
 ; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP70]], align 4
-; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 4
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, i32* [[TMP65]], i32 4
 ; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP72]], align 4
-; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 8
+; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, i32* [[TMP65]], i32 8
 ; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 12
+; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, i32* [[TMP65]], i32 12
 ; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP76]], align 4
 ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
@@ -2293,20 +2293,20 @@
 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
@@ -2461,20 +2461,20 @@
 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
@@ -2639,20 +2639,20 @@
 ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -49,13 +49,13 @@
 ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
 ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0
 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]],
-; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
-; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
 ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !3
 ; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
-; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
 ; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !5, !noalias !7
 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
@@ -136,40 +136,40 @@
 ; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]],
 ; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]],
 ; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]],
-; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
-; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
-; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP3]]
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
+; AVX2-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0
 ; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>*
 ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison), !alias.scope !3
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8
+; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 8
 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
 ; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison), !alias.scope !3
-; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 16
+; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16
 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>*
 ; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison), !alias.scope !3
-; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 24
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 24
 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>*
 ; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !3
 ; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
 ; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
 ; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
 ; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
-; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
-; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0
 ; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <8 x i32>*
 ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP32]], <8 x i32>* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !5, !noalias !7
-; AVX2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 8
+; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 8
 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>*
 ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP33]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !5, !noalias !7
-; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 16
+; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16
 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>*
 ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !5, !noalias !7
-; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 24
+; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 24
 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>*
 ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !5, !noalias !7
 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
@@ -252,40 +252,40 @@
 ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]],
 ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]],
 ; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]],
-; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
-; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP3]]
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]]
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0
getelementptr i32, i32* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison), !alias.scope !3 -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 16 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison), !alias.scope !3 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 32 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison), !alias.scope !3 -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 48 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison), !alias.scope !3 ; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] ; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0 ; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <16 x i32>* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP32]], <16 x i32>* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 16 +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <16 x i32>* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP33]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 32 +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 32 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>* ; AVX512-NEXT: call void 
@llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 48 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 48 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 @@ -307,13 +307,13 @@ ; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP52]], align 4 ; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD21]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TMP54]], i32 0 +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP49]] +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[TMP54]], i32 0 ; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x i32> poison) ; AVX512-NEXT: [[TMP57:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_LOAD21]] -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32* [[TMP58]], i32 0 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP49]] +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr i32, i32* [[TMP58]], i32 0 ; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to <8 x i32>* ; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP57]], <8 x i32>* [[TMP60]], i32 4, <8 x i1> [[TMP53]]) ; AVX512-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 8 @@ -408,13 +408,13 @@ ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)* ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP5]], i32 0 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !14 ; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP9]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP9]], i32 0 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* ; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x 
i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !16, !noalias !18 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -495,40 +495,40 @@ ; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], ; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 0 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 ; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison), !alias.scope !14 -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 8 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison), !alias.scope !14 -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 16 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison), !alias.scope !14 -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 24 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !14 ; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] ; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] -; 
AVX2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 0 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 ; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP32]], <8 x i32> addrspace(1)* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 8 +; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 8 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP33]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 16 +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 16 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !16, !noalias !18 -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 24 +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 24 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 @@ -611,40 +611,40 @@ ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], ; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 0 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = 
call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison), !alias.scope !16 -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 16 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison), !alias.scope !16 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 32 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison), !alias.scope !16 -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 48 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison), !alias.scope !16 ; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] ; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 0 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 ; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP32]], <16 x i32> addrspace(1)* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !18, !noalias !20 -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 16 +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 16 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP33]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP17]]), 
!alias.scope !18, !noalias !20 -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 32 +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 32 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !18, !noalias !20 -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 48 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 48 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !18, !noalias !20 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 @@ -666,13 +666,13 @@ ; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32 addrspace(1)* [[TMP51]] to <8 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP52]], align 4 ; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD21]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP54]], i32 0 +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP49]] +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP54]], i32 0 ; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32 addrspace(1)* [[TMP55]] to <8 x i32> addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x i32> poison) ; AVX512-NEXT: [[TMP57:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_LOAD21]] -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP58]], i32 0 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP49]] +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP58]], i32 0 ; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32 addrspace(1)* [[TMP59]] to <8 x i32> addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP57]], <8 x i32> addrspace(1)* [[TMP60]], i32 4, <8 x i1> [[TMP53]]) ; AVX512-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 8 @@ -776,14 +776,14 @@ ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP5]], i32 0 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x float> poison), !alias.scope !24 ; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> ; 
AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]] -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0 ; AVX1-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>* ; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !26, !noalias !28 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -865,20 +865,20 @@ ; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], ; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr float, float* [[B]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0 ; AVX2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison), !alias.scope !24 -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 8 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison), !alias.scope !24 -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 16 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison), !alias.scope !24 -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 24 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison), !alias.scope !24 ; AVX2-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> @@ -889,20 +889,20 @@ ; AVX2-NEXT: [[TMP37:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD15]], [[TMP33]] ; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD16]], [[TMP34]] ; AVX2-NEXT: [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD17]], [[TMP35]] -; AVX2-NEXT: 
[[TMP40:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 0 +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>* ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP36]], <8 x float>* [[TMP45]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !26, !noalias !28 -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 8 +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 8 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <8 x float>* ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP37]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !26, !noalias !28 -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 16 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 16 ; AVX2-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <8 x float>* ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !26, !noalias !28 -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 24 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 24 ; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <8 x float>* ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 @@ -986,20 +986,20 @@ ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], ; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr float, float* [[B]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <16 x float>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], 
<16 x float> poison), !alias.scope !27 -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 16 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> poison), !alias.scope !27 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 32 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison), !alias.scope !27 -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 48 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison), !alias.scope !27 ; AVX512-NEXT: [[TMP32:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> @@ -1010,20 +1010,20 @@ ; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD15]], [[TMP33]] ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD16]], [[TMP34]] ; AVX512-NEXT: [[TMP39:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD17]], [[TMP35]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 0 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP36]], <16 x float>* [[TMP45]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !29, !noalias !31 -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 16 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 16 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !29, !noalias !31 -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 32 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 32 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !29, 
!noalias !31 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 48 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 48 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !29, !noalias !31 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 @@ -1045,14 +1045,14 @@ ; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP56]], align 4 ; AVX512-NEXT: [[TMP57:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD21]], -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP53]] -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP58]], i32 0 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr float, float* [[B]], i64 [[TMP53]] +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr float, float* [[TMP58]], i32 0 ; AVX512-NEXT: [[TMP60:%.*]] = bitcast float* [[TMP59]] to <8 x float>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP60]], i32 4, <8 x i1> [[TMP57]], <8 x float> poison) ; AVX512-NEXT: [[TMP61:%.*]] = sitofp <8 x i32> [[WIDE_LOAD21]] to <8 x float> ; AVX512-NEXT: [[TMP62:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD22]], [[TMP61]] -; AVX512-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP53]] -; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP63]], i32 0 +; AVX512-NEXT: [[TMP63:%.*]] = getelementptr float, float* [[A]], i64 [[TMP53]] +; AVX512-NEXT: [[TMP64:%.*]] = getelementptr float, float* [[TMP63]], i32 0 ; AVX512-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <8 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP62]], <8 x float>* [[TMP65]], i32 4, <8 x i1> [[TMP57]]) ; AVX512-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 8 @@ -1176,20 +1176,20 @@ ; AVX-NEXT: [[TMP17:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD12]], ; AVX-NEXT: [[TMP18:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD13]], ; AVX-NEXT: [[TMP19:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD14]], -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP0]] -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP1]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP2]] -; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP3]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 0 +; AVX-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[B]], i64 [[TMP0]] +; AVX-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]] +; AVX-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[B]], i64 [[TMP2]] +; AVX-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]] +; AVX-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0 ; AVX-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>* ; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> poison), !alias.scope !34 -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 4 +; AVX-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 4 ; AVX-NEXT: [[TMP27:%.*]] = bitcast double* 
[[TMP26]] to <4 x double>* ; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> poison), !alias.scope !34 -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 8 +; AVX-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 8 ; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* ; AVX-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> poison), !alias.scope !34 -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 12 +; AVX-NEXT: [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 12 ; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* ; AVX-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> poison), !alias.scope !34 ; AVX-NEXT: [[TMP32:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> @@ -1200,20 +1200,20 @@ ; AVX-NEXT: [[TMP37:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]] ; AVX-NEXT: [[TMP38:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]] ; AVX-NEXT: [[TMP39:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]] -; AVX-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] -; AVX-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; AVX-NEXT: [[TMP42:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]] -; AVX-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP3]] -; AVX-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 0 +; AVX-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[A]], i64 [[TMP0]] +; AVX-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]] +; AVX-NEXT: [[TMP42:%.*]] = getelementptr double, double* [[A]], i64 [[TMP2]] +; AVX-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]] +; AVX-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0 ; AVX-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !36, !noalias !38 -; AVX-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 4 +; AVX-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 4 ; AVX-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP37]], <4 x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !36, !noalias !38 -; AVX-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 8 +; AVX-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 8 ; AVX-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !36, !noalias !38 -; AVX-NEXT: [[TMP50:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 12 +; AVX-NEXT: [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 12 ; AVX-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <4 x double>* ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x 
double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 ; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -1295,20 +1295,20 @@ ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], ; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 0 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[B]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[B]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> poison), !alias.scope !38 -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 8 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 8 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> poison), !alias.scope !38 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 16 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> poison), !alias.scope !38 -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 24 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 24 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> poison), !alias.scope !38 ; AVX512-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> @@ -1319,20 +1319,20 @@ ; AVX512-NEXT: [[TMP37:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]] ; AVX512-NEXT: [[TMP38:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]] ; AVX512-NEXT: [[TMP39:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP40]], 
i32 0 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[A]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr double, double* [[A]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]] +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope !40, !noalias !42 -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 8 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 8 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], <8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !40, !noalias !42 -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 16 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 16 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], <8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope !40, !noalias !42 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 24 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 24 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <8 x double>* ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !40, !noalias !42 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 @@ -1690,30 +1690,30 @@ ; AVX2-NEXT: [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE13]], zeroinitializer ; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt <4 x i32> [[REVERSE15]], zeroinitializer ; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt <4 x i32> [[REVERSE17]], zeroinitializer -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 0 -; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP28]], i32 -3 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP24]], i32 0 +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -3 ; AVX2-NEXT: [[REVERSE18:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> poison), 
!alias.scope !44 ; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -4 -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP31]], i32 -3 +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -4 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -3 ; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !44 ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -8 -; AVX2-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i32 -3 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -8 +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -3 ; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope !44 ; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD24]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -12 -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i32 -3 +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -12 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -3 ; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope !44 @@ -1722,28 +1722,28 @@ ; AVX2-NEXT: [[TMP41:%.*]] = fadd <4 x double> [[REVERSE22]], ; AVX2-NEXT: [[TMP42:%.*]] = fadd <4 x double> [[REVERSE25]], ; AVX2-NEXT: [[TMP43:%.*]] = fadd <4 x double> [[REVERSE28]], -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] -; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]] ; AVX2-NEXT: [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP40]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 -; AVX2-NEXT: [[TMP49:%.*]] = getelementptr inbounds 
double, double* [[TMP48]], i32 -3 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0 +; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -3 ; AVX2-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -4 -; AVX2-NEXT: [[TMP52:%.*]] = getelementptr inbounds double, double* [[TMP51]], i32 -3 +; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -4 +; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -3 ; AVX2-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[REVERSE33:%.*]] = shufflevector <4 x double> [[TMP42]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -8 -; AVX2-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, double* [[TMP54]], i32 -3 +; AVX2-NEXT: [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -8 +; AVX2-NEXT: [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -3 ; AVX2-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[REVERSE35:%.*]] = shufflevector <4 x double> [[TMP43]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -12 -; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP57]], i32 -3 +; AVX2-NEXT: [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -12 +; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -3 ; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -1833,30 +1833,30 @@ ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i32> [[REVERSE13]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <8 x i32> [[REVERSE15]], zeroinitializer ; AVX512-NEXT: [[TMP23:%.*]] = icmp sgt <8 x i32> [[REVERSE17]], zeroinitializer -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP1]] -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP2]] -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 0 -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP28]], i32 -7 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, 
double* [[IN]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP24]], i32 0
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -7
; AVX512-NEXT: [[REVERSE18:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> poison, <8 x i32>
; AVX512-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> poison), !alias.scope !58
; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32>
-; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -8
-; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP31]], i32 -7
+; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -8
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -7
; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> poison, <8 x i32>
; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !58
; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32>
-; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -16
-; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i32 -7
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -16
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -7
; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i1> [[TMP22]], <8 x i1> poison, <8 x i32>
; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> poison), !alias.scope !58
; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD24]], <8 x double> poison, <8 x i32>
-; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -24
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i32 -7
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -24
+; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -7
; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP23]], <8 x i1> poison, <8 x i32>
; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> poison), !alias.scope !58
@@ -1865,28 +1865,28 @@
; AVX512-NEXT: [[TMP41:%.*]] = fadd <8 x double> [[REVERSE22]],
; AVX512-NEXT: [[TMP42:%.*]] = fadd <8 x double> [[REVERSE25]],
; AVX512-NEXT: [[TMP43:%.*]] = fadd <8 x double> [[REVERSE28]],
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
+; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
; AVX512-NEXT: [[REVERSE29:%.*]] = shufflevector <8 x double> [[TMP40]], <8 x double> poison, <8 x i32>
-; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
-; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 -7
+; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0
+; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -7
; AVX512-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !60, !noalias !62
; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x double> [[TMP41]], <8 x double> poison, <8 x i32>
-; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -8
-; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds double, double* [[TMP51]], i32 -7
+; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -8
+; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -7
; AVX512-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !60, !noalias !62
; AVX512-NEXT: [[REVERSE33:%.*]] = shufflevector <8 x double> [[TMP42]], <8 x double> poison, <8 x i32>
-; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -16
-; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, double* [[TMP54]], i32 -7
+; AVX512-NEXT: [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -16
+; AVX512-NEXT: [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -7
; AVX512-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !60, !noalias !62
; AVX512-NEXT: [[REVERSE35:%.*]] = shufflevector <8 x double> [[TMP43]], <8 x double> poison, <8 x i32>
-; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -24
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP57]], i32 -7
+; AVX512-NEXT: [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -24
+; AVX512-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -7
; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !60, !noalias !62
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
@@ -1996,34 +1996,34 @@
; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]]
-; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP1]]
-; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP3]]
+; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]]
+; AVX1-NEXT: [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]],
; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]],
; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]],
; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]],
-; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 0
+; AVX1-NEXT: [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0
; AVX1-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> poison)
-; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 4
+; AVX1-NEXT: [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 4
; AVX1-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> poison)
-; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8
+; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 8
; AVX1-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> poison)
-; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 12
+; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 12
; AVX1-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> poison)
; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
-; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
+; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
+; AVX1-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]],
; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]],
; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]],
@@ -2032,16 +2032,16 @@
; AVX1-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
+; AVX1-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4
+; AVX1-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
+; AVX1-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12
+; AVX1-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -2120,34 +2120,34 @@
; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]]
-; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP1]]
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]],
; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]],
; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]],
; AVX2-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]],
-; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 0
+; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0
; AVX2-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> poison)
-; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 4
+; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 4
; AVX2-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> poison)
-; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 8
; AVX2-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> poison)
-; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 12
+; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 12
; AVX2-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> poison)
; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
-; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]],
; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]],
; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]],
@@ -2156,16 +2156,16 @@
; AVX2-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
+; AVX2-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4
+; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
+; AVX2-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
; AVX2-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12
+; AVX2-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
; AVX2-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -2244,34 +2244,34 @@
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer
; AVX512-NEXT: [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer
; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP1]]
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]]
+; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x i1> [[TMP20]],
; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP21]],
; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP22]],
; AVX512-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP23]],
-; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 0
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0
; AVX512-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x double*> poison)
-; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 8
; AVX512-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x double*> poison)
-; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 16
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 16
; AVX512-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x double*> poison)
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 24
+; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 24
; AVX512-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <8 x double*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x double*> poison)
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP41:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX512-NEXT: [[TMP42:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX512-NEXT: [[TMP43:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
+; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP48:%.*]] = xor <8 x i1> [[TMP40]],
; AVX512-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP41]],
; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP42]],
@@ -2280,16 +2280,16 @@
; AVX512-NEXT: [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer
-; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
+; AVX512-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
; AVX512-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
+; AVX512-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 8
; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]])
-; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 16
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 16
; AVX512-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]])
-; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 24
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 24
; AVX512-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
@@ -2413,34 +2413,34 @@
; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
-; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
-; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
+; AVX1-NEXT: [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
+; AVX1-NEXT: [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]],
; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]],
; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]],
; AVX1-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]],
-; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 0
+; AVX1-NEXT: [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0
; AVX1-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> poison)
-; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 4
+; AVX1-NEXT: [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 4
; AVX1-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> poison)
-; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8
+; AVX1-NEXT: [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8
; AVX1-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> poison)
-; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 12
+; AVX1-NEXT: [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 12
; AVX1-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> poison)
; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
-; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
+; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
+; AVX1-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]],
; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]],
; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]],
@@ -2449,16 +2449,16 @@
; AVX1-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
+; AVX1-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4
+; AVX1-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
+; AVX1-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12
+; AVX1-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -2537,34 +2537,34 @@
; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
-; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]],
; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]],
; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP22]],
; AVX2-NEXT: [[TMP31:%.*]] = xor <4 x i1> [[TMP23]],
-; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 0
+; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0
; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> poison)
-; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 4
+; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 4
; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> poison)
-; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8
; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> poison)
-; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 12
+; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 12
; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> poison)
; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
-; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
; AVX2-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]],
; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]],
; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP42]],
@@ -2573,16 +2573,16 @@
; AVX2-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
+; AVX2-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 4
+; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
+; AVX2-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
; AVX2-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 12
+; AVX2-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
; AVX2-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -2661,34 +2661,34 @@
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer
; AVX512-NEXT: [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer
; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
+; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x i1> [[TMP20]],
; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP21]],
; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP22]],
; AVX512-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP23]],
-; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 0
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0
; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x i32 ()*> poison)
-; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8
; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x i32 ()*> poison)
-; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 16
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 16
; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x i32 ()*> poison)
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 24
+; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 24
; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <8 x i32 ()*>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x i32 ()*> poison)
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP41:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX512-NEXT: [[TMP42:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX512-NEXT: [[TMP43:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]]
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
+; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
; AVX512-NEXT: [[TMP48:%.*]] = xor <8 x i1> [[TMP40]],
; AVX512-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP41]],
; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP42]],
@@ -2697,16 +2697,16 @@
; AVX512-NEXT: [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer
-; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0
+; AVX512-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
; AVX512-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 8
+; AVX512-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 8
; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]])
-; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 16
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 16
; AVX512-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]])
-; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 24
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 24
; AVX512-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -121,7 +121,7 @@
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]]
; DISABLED_MASKED_STRIDED: pred.load.continue14:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = bitcast i8* [[TMP50]] to <8 x i8>*
; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP49]], <8 x i8>* [[TMP51]], i32 1, <8 x i1> [[TMP0]])
; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -141,13 +141,13 @@
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[P:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP5]], i32 1, <8 x i1> [[TMP0]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -297,7 +297,7 @@
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]]
; DISABLED_MASKED_STRIDED: pred.load.continue14:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = bitcast i8* [[TMP50]] to <8 x i8>*
; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP49]], <8 x i8>* [[TMP51]], i32 1, <8 x i1> [[TMP0]])
; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -317,14 +317,14 @@
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[P:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]],
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> poison)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -489,7 +489,7 @@
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]]
; DISABLED_MASKED_STRIDED: pred.load.continue16:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP52]] to <8 x i8>*
; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP51]], <8 x i8>* [[TMP53]], i32 1, <8 x i1> [[TMP3]])
; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
@@ -518,15 +518,15 @@
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[P:%.*]], i32 [[TMP2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]],
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x i8> poison)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
@@ -696,7 +696,7 @@
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]]
; DISABLED_MASKED_STRIDED: pred.load.continue16:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP52]] to <8 x i8>*
; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP51]], <8 x i8>* [[TMP53]], i32 1, <8 x i1> [[TMP3]])
; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
@@ -725,15 +725,15 @@
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul nsw i32 [[INDEX]], 3
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[P:%.*]], i32 [[TMP2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <24 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32>
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]],
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0v24i8(<24 x i8>* [[TMP5]], i32 1, <24 x i1> [[TMP6]], <24 x i8> poison)
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, <8 x i32>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
@@ -1432,8 +1432,8 @@
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[P:%.*]], i32 [[TMP1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32>
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
@@ -1443,8 +1443,8 @@
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC1]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC1]], <8 x i8> [[STRIDED_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[TMP8]], i32 [[TMP4]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32>
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
@@ -2619,8 +2619,8 @@
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[P:%.*]], i32 [[TMP2]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32>
@@ -2631,8 +2631,8 @@
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[TMP10]], i32 [[TMP6]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32>
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
@@ -382,8 +382,8 @@
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD]], zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nuw nsw i64 [[INDEX]], 3
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul i64 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr i16, i16* [[POINTS:%.*]], i64 [[TMP3]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <12 x i16>*
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> poison, <12 x i32>
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <12 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
@@ -23,7 +23,7 @@
;CHECK-LABEL: @masked_strided(
;CHECK: vector.body:
-;CHECK-NEXT: %index = phi i32
+;CHECK-NEXT: %index = phi i32
;CHECK-NEXT: %[[VECIND:.+]] = phi <8 x i32> [
;CHECK-NEXT: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
;CHECK-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]],
diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
--- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
@@ -112,8 +112,8 @@
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDEX]] to i16
; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[TMP2]], 0
; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, i16* [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP7]], align 1
; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]