diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -511,9 +511,9 @@
   /// A helper function to scalarize a single Instruction in the innermost loop.
   /// Generates a sequence of scalar instances for each lane between \p MinLane
   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
-  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
+  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
   /// Instr's operands.
-  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
+  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                             const VPIteration &Instance, bool IfPredicateInstr,
                             VPTransformState &State);
@@ -3040,8 +3040,19 @@
   }
 }
 
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
-                                               VPUser &User,
+// Utility that clears Instruction flags that may no longer hold after
+// linearizing the control flow as a result of vectorization.
+static void clearUnsafeFlagsAfterLinearizingCF(Instruction *NewInst) {
+  if (isa<OverflowingBinaryOperator>(NewInst)) {
+    NewInst->setHasNoUnsignedWrap(false);
+    NewInst->setHasNoSignedWrap(false);
+  }
+  if (isa<PossiblyExactOperator>(NewInst))
+    NewInst->setIsExact(false);
+}
+
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+                                               VPReplicateRecipe *RepRecipe,
                                                const VPIteration &Instance,
                                                bool IfPredicateInstr,
                                                VPTransformState &State) {
@@ -3062,17 +3073,26 @@
   if (!IsVoidRetTy)
     Cloned->setName(Instr->getName() + ".cloned");
 
+  // If the scalarized instruction was in a basic block that needed predication
+  // and it is not predicated after vectorization, we cannot propagate NUW/NSW
+  // flags. The control flow has been linearized and the instruction is no
+  // longer guarded by the predicate, which may cause the NUW/NSW properties
+  // to no longer hold.
+  if (!RepRecipe->isPredicated() && !State.Instance && State.VF.isVector() &&
+      Legal->blockNeedsPredication(Instr->getParent()))
+    clearUnsafeFlagsAfterLinearizingCF(Cloned);
+
   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
                                Builder.GetInsertPoint());
   // Replace the operands of the cloned instructions with their scalar
   // equivalents in the new loop.
-  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
+  for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) {
     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
     auto InputInstance = Instance;
     if (!Operand || !OrigLoop->contains(Operand) ||
         (Cost->isUniformAfterVectorization(Operand, State.VF)))
       InputInstance.Lane = VPLane::getFirstLane();
-    auto *NewOp = State.get(User.getOperand(op), InputInstance);
+    auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance);
     Cloned->setOperand(op, NewOp);
   }
   addNewMetadata(Cloned, Instr);
@@ -3080,7 +3100,7 @@
   // Place the cloned scalar in the new loop.
   Builder.Insert(Cloned);
 
-  State.set(Def, Cloned, Instance);
+  State.set(RepRecipe, Cloned, Instance);
 
   // If we just cloned a new assumption, add it the assumption cache.
   if (auto *II = dyn_cast<AssumeInst>(Cloned))
@@ -4901,9 +4921,17 @@
       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
 
-      if (auto *VecOp = dyn_cast<Instruction>(V))
+      if (auto *VecOp = dyn_cast<Instruction>(V)) {
         VecOp->copyIRFlags(&I);
+
+        // If the instruction was in a basic block that needed predication, we
+        // can't propagate NUW/NSW flags. The control flow has been linearized
+        // and the instruction is no longer guarded by the predicate, which
+        // may cause the NUW/NSW properties to no longer hold.
+        if (Legal->blockNeedsPredication(I.getParent()))
+          clearUnsafeFlagsAfterLinearizingCF(VecOp);
+      }
+
       // Use this vector value for all users of the original instruction.
       State.set(Def, V, Part);
       addMetadata(V, &I);
@@ -9809,8 +9837,8 @@
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
-                                    *State.Instance, IsPredicated, State);
+    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
+                                    IsPredicated, State);
     // Insert scalar instance packing it into a vector.
     if (AlsoPack && State.VF.isVector()) {
       // If we're constructing lane 0, initialize to start from poison.
@@ -9833,7 +9861,7 @@
          "Can't scalarize a scalable vector");
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
-      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
+      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                       VPIteration(Part, Lane), IsPredicated,
                                       State);
 }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll
@@ -86,7 +86,7 @@
 ; CHECK-NEXT: %[[ICMP:.*]] = icmp ne <vscale x 4 x i32> %[[LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 0, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK: %[[MASKED_LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %[[BITCAST:.*]], i32 4, <vscale x 4 x i1> %[[ICMP]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT: %[[MASKED_GATHER:.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %[[SPLAT]], i32 4, <vscale x 4 x i1> %[[ICMP]], <vscale x 4 x i32> undef)
-; CHECK-NEXT: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[MASKED_GATHER]], %[[MASKED_LOAD]]
+; CHECK-NEXT: %[[ADD:.*]] = add <vscale x 4 x i32> %[[MASKED_GATHER]], %[[MASKED_LOAD]]
 ; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32>* %[[BITCAST1:.*]], i32 4, <vscale x 4 x i1> %[[ICMP]])
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -54,7 +54,7 @@
 ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
 ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !3
-; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX1-NEXT: [[TMP8:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
 ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
 ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
@@ -154,10 +154,10 @@
 ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 24
 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>*
 ; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !3
-; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
-; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
-; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
+; AVX2-NEXT: [[TMP32:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX2-NEXT: [[TMP33:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
+; AVX2-NEXT: [[TMP34:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
+; AVX2-NEXT: [[TMP35:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
 ; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
 ; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
 ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
@@ -271,10 +271,10 @@
 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 48
 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
 ; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison), !alias.scope !3
-; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
-; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
-; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
+; AVX512-NEXT: [[TMP32:%.*]] = add <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX512-NEXT: [[TMP33:%.*]] = add <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
+; AVX512-NEXT: [[TMP34:%.*]] = add <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
+; AVX512-NEXT: [[TMP35:%.*]] = add <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
 ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
 ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
 ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
@@ -314,7 +314,7 @@
 ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TMP54]], i32 0
 ; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>*
 ; AVX512-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x i32> poison)
-; AVX512-NEXT: [[TMP57:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_LOAD21]]
+; AVX512-NEXT: [[TMP57:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_LOAD21]]
 ; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP49]]
 ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32* [[TMP58]], i32 0
 ; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to <8 x i32>*
@@ -416,7 +416,7 @@
 ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP5]], i32 0
 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)*
 ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !14
-; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX1-NEXT: [[TMP8:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
 ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP0]]
 ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP9]], i32 0
 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)*
@@ -516,10 +516,10 @@
 ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 24
 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)*
 ; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !14
-; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
-; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
-; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
+; AVX2-NEXT: [[TMP32:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX2-NEXT: [[TMP33:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
+; AVX2-NEXT: [[TMP34:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
+; AVX2-NEXT: [[TMP35:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
 ; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP0]]
 ; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP1]]
 ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
@@ -633,10 +633,10 @@
 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 48
 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)*
 ; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison), !alias.scope !16
-; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
-; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
-; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
+; AVX512-NEXT: [[TMP32:%.*]] = add <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX512-NEXT: [[TMP33:%.*]] = add <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]]
+; AVX512-NEXT: [[TMP34:%.*]] = add <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]]
+; AVX512-NEXT: [[TMP35:%.*]] = add <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
 ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP0]]
 ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP1]]
 ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
@@ -676,7 +676,7 @@
 ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP54]], i32 0
 ; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32 addrspace(1)* [[TMP55]] to <8 x i32> addrspace(1)*
 ; AVX512-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x i32> poison)
-; AVX512-NEXT: [[TMP57:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_LOAD21]]
+; AVX512-NEXT: [[TMP57:%.*]] = add <8 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_LOAD21]]
 ; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP49]]
 ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP58]], i32 0
 ; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32 addrspace(1)* [[TMP59]] to <8 x i32> addrspace(1)*
@@ -1471,7 +1471,7 @@
 ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]]
 ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP0]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !45
 ; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]],
-; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]],
+; AVX512-NEXT: [[TMP2:%.*]] = shl <8 x i64> [[VEC_IND]],
 ; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP2]]
 ; AVX512-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> undef), !alias.scope !48
 ; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr52111.ll b/llvm/test/Transforms/LoopVectorize/X86/pr52111.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/pr52111.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr52111.ll
@@ -14,7 +14,7 @@
 ; CHECK: %[[lane0Idx:.*]] = add i64 %index, 0
 ; CHECK: %[[cmp:.*]] = icmp eq <4 x i64> %[[vecInd]], zeroinitializer
 ; We shouldn't have NUW/NSW flags in the following sub instruction.
-; CHECK: %[[sub:.*]] = sub nuw nsw i64 %[[lane0Idx]], 1
+; CHECK: %[[sub:.*]] = sub i64 %[[lane0Idx]], 1
 ; CHECK: %[[g0:.*]] = getelementptr inbounds float, float* %{{.*}}, i64 %[[sub]]
 ; CHECK: %[[mask:.*]] = xor <4 x i1> %[[cmp]],
 ; CHECK: %[[g1:.*]] = getelementptr inbounds float, float* %[[g0]], i32 0
@@ -53,8 +53,8 @@
 ; CHECK: %[[vecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %{{.*}} ]
 ; CHECK: %[[cmp:.*]] = icmp eq <4 x i64> %[[vecInd]], zeroinitializer
 ; We shouldn't have NUW/NSW flags in the following sub/mul instructions.
-; CHECK: %[[sub:.*]] = sub nuw nsw <4 x i64> %[[vecInd]],
-; CHECK: %[[mul:.*]] = mul nuw nsw <4 x i64> %[[sub]],
+; CHECK: %[[sub:.*]] = sub <4 x i64> %[[vecInd]],
+; CHECK: %[[mul:.*]] = mul <4 x i64> %[[sub]],
 ; CHECK: %[[g0:.*]] = getelementptr inbounds float, float* %{{.*}}, <4 x i64> %[[mul]]
 ; CHECK: %[[mask:.*]] = xor <4 x i1> %[[cmp]],
 ; CHECK: call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %[[g0]], i32 4, <4 x i1> %[[mask]], <4 x float> undef)
@@ -96,7 +96,7 @@
 ; CHECK: %[[cmp1:.*]] = icmp eq <4 x i64> %[[and0]], zeroinitializer
 ; CHECK: %[[and1:.*]] = and <4 x i1> %[[cmp0]], %[[cmp1]]
 ; We shouldn't have the 'exact' flag in the following div instruction.
-; CHECK: %[[div:.*]] = sdiv exact <4 x i64> %[[vecInd]],
+; CHECK: %[[div:.*]] = sdiv <4 x i64> %[[vecInd]],
 ; CHECK: %[[g0:.*]] = getelementptr inbounds float, float* %{{.*}}, <4 x i64> %[[div]]
 ; CHECK: %[[mask:.*]] = xor <4 x i1> %[[and1]],
 ; CHECK: call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %[[g0]], i32 4, <4 x i1> %[[mask]], <4 x float> undef),
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -40,7 +40,7 @@
 ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ]
 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ]
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[VEC_IND]],
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; DISABLED_MASKED_STRIDED: pred.load.if:
@@ -141,7 +141,7 @@
 ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
 ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32>
@@ -216,7 +216,7 @@
 ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ]
 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ]
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[VEC_IND]],
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; DISABLED_MASKED_STRIDED: pred.load.if:
@@ -317,7 +317,7 @@
 ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
 ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32>
@@ -407,7 +407,7 @@
 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ]
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[VEC_IND]],
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -518,7 +518,7 @@
 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 1
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
@@ -614,7 +614,7 @@
 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ]
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[VEC_IND]],
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[VEC_IND]],
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -725,7 +725,7 @@
 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul nsw i32 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 [[INDEX]], 3
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <24 x i8>*
@@ -1106,7 +1106,7 @@
 ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE60:%.*]] ]
 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE60]] ]
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[VEC_IND]],
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; DISABLED_MASKED_STRIDED: pred.load.if:
@@ -1432,7 +1432,7 @@
 ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
 ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32>
@@ -1517,7 +1517,7 @@
 ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE65:%.*]] ]
 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE65]] ]
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = shl <8 x i32> [[VEC_IND]],
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP7]], i32 0
 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; DISABLED_MASKED_STRIDED: pred.load.if:
@@ -1866,7 +1866,7 @@
 ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE65:%.*]] ]
 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE65]] ]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = shl <8 x i32> [[VEC_IND]],
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP7]], i32 0
 ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; ENABLED_MASKED_STRIDED: pred.load.if:
@@ -2284,7 +2284,7 @@
 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE62]] ]
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[VEC_IND]],
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -2619,7 +2619,7 @@
 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 1
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP2]]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
@@ -329,7 +329,7 @@
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
 ; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD]], zeroinitializer
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nuw nsw <4 x i64> [[VEC_IND]],
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul <4 x i64> [[VEC_IND]],
 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; DISABLED_MASKED_STRIDED: pred.store.if:
@@ -382,7 +382,7 @@
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
 ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD]], zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nuw nsw i64 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul i64 [[INDEX]], 3
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP3]]
 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <12 x i16>*
 ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> poison, <12 x i32>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
@@ -23,10 +23,10 @@
 ;CHECK-LABEL: @masked_strided(
 ;CHECK: vector.body:
-;CHECK-NEXT:  %index = phi i32
+;CHECK-NEXT:  %index = phi i32
 ;CHECK-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [
 ;CHECK-NEXT:  %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
-;CHECK-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]],
+;CHECK-NEXT:  %{{.*}} = shl <8 x i32> %[[VECIND]],
 ;CHECK-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
 ;CHECK-NEXT:  br i1 %[[M]], label %pred.store.if, label %pred.store.continue
 ;CHECK-NOT:   %{{.+}} = load <16 x i8>, <16 x i8>* %{{.*}}, align 1
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
@@ -36,9 +36,9 @@
 ; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[WideAVal]], %[[InnerForPh]] ], [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ]
 ; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, <4 x i64> %[[InnerInd]]
 ; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[BAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
-; CHECK: %[[Add1:.*]] = add nsw <4 x i32> %[[WideBVal]], %[[VecIndTr]]
-; CHECK: %[[AccumPhiNext]] = add nsw <4 x i32> %[[Add1]], %[[AccumPhi]]
-; CHECK: %[[InnerIndNext]] = add nuw nsw <4 x i64> %[[InnerInd]],
+; CHECK: %[[Add1:.*]] = add <4 x i32> %[[WideBVal]], %[[VecIndTr]]
+; CHECK: %[[AccumPhiNext]] = add <4 x i32> %[[Add1]], %[[AccumPhi]]
+; CHECK: %[[InnerIndNext]] = add <4 x i64> %[[InnerInd]],
 ; CHECK: %[[InnerVecCond:.*]] = icmp eq <4 x i64> %[[InnerIndNext]], {{.*}}
 ; CHECK: %[[InnerCond:.+]] = extractelement <4 x i1> %[[InnerVecCond]], i32 0
 ; CHECK: br i1 %[[InnerCond]], label %[[InnerCrit:.*]], label %[[InnerForBody]]
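
For illustration, a minimal IR sketch of why the flags must be dropped, distilled from the pr52111.ll test above (block and value names here are hypothetical, not taken from the patch). In the scalar loop, the flagged instruction only executes under its block's predicate:

  loop:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
    %cond = icmp eq i64 %iv, 0
    br i1 %cond, label %latch, label %if.then   ; %idx is skipped when %iv == 0
  if.then:
    %idx = sub nuw nsw i64 %iv, 1               ; cannot wrap while guarded
    ...

After vectorization the control flow is linearized, so the scalarized (or widened) sub executes unconditionally, including for the lane where %iv == 0; there "sub nuw i64 0, 1" wraps and yields poison even though the original program never evaluated it. The predicate survives only as the mask of the memory operation, so the wrapping/exactness flags must be cleared, which is what clearUnsafeFlagsAfterLinearizingCF() does above for nuw/nsw and exact.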