Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -663,6 +663,12 @@
   bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked gather.
   bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
+  /// Return true if the target forces scalarizing of llvm.masked.gather
+  /// intrinsics.
+  bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const;
+  /// Return true if the target forces scalarizing of llvm.masked.scatter
+  /// intrinsics.
+  bool forceScalarizeMaskedScatter(VectorType *Type, Align Alignment) const;
 
   /// Return true if the target supports masked compress store.
   bool isLegalMaskedCompressStore(Type *DataType) const;
@@ -1544,6 +1550,10 @@
   virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
+  virtual bool forceScalarizeMaskedGather(VectorType *DataType,
+                                          Align Alignment) = 0;
+  virtual bool forceScalarizeMaskedScatter(VectorType *DataType,
+                                           Align Alignment) = 0;
   virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
   virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
   virtual bool enableOrderedReductions() = 0;
@@ -1947,6 +1957,14 @@
   bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedGather(DataType, Alignment);
   }
+  bool forceScalarizeMaskedGather(VectorType *DataType,
+                                  Align Alignment) override {
+    return Impl.forceScalarizeMaskedGather(DataType, Alignment);
+  }
+  bool forceScalarizeMaskedScatter(VectorType *DataType,
+                                   Align Alignment) override {
+    return Impl.forceScalarizeMaskedScatter(DataType, Alignment);
+  }
   bool isLegalMaskedCompressStore(Type *DataType) override {
     return Impl.isLegalMaskedCompressStore(DataType);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -267,6 +267,15 @@
     return false;
   }
 
+  bool forceScalarizeMaskedGather(VectorType *DataType, Align Alignment) const {
+    return false;
+  }
+
+  bool forceScalarizeMaskedScatter(VectorType *DataType,
+                                   Align Alignment) const {
+    return false;
+  }
+
   bool isLegalMaskedCompressStore(Type *DataType) const { return false; }
 
   bool isLegalMaskedExpandLoad(Type *DataType) const { return false; }
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -408,6 +408,16 @@
   return TTIImpl->isLegalMaskedScatter(DataType, Alignment);
 }
 
+bool TargetTransformInfo::forceScalarizeMaskedGather(VectorType *DataType,
+                                                     Align Alignment) const {
+  return TTIImpl->forceScalarizeMaskedGather(DataType, Alignment);
+}
+
+bool TargetTransformInfo::forceScalarizeMaskedScatter(VectorType *DataType,
+                                                      Align Alignment) const {
+  return TTIImpl->forceScalarizeMaskedScatter(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalMaskedCompressStore(Type *DataType) const {
   return TTIImpl->isLegalMaskedCompressStore(DataType);
 }
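The three files above define the contract for the new hooks: the existing `isLegalMaskedGather`/`isLegalMaskedScatter` queries still answer whether the target can emit the intrinsic at all, the new `forceScalarizeMaskedGather`/`forceScalarizeMaskedScatter` queries let a target demand expansion even when the operation is legal, and the `TargetTransformInfoImplBase` defaults of `false` keep every target that does not override them behaving exactly as before. A minimal sketch of how a client is meant to combine the two answers (the helper name is invented for illustration; the in-tree callers changed later in this patch inline the same check):

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Emit a masked gather intrinsic only if it is legal *and* the target has not
// asked for it to be scalarized; otherwise leave it for
// ScalarizeMaskedMemIntrin to expand.
static bool useGatherIntrinsic(const TargetTransformInfo &TTI, VectorType *VTy,
                               Align Alignment) {
  return TTI.isLegalMaskedGather(VTy, Alignment) &&
         !TTI.forceScalarizeMaskedGather(VTy, Alignment);
}
```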
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -189,6 +189,18 @@
     return isLegalMaskedLoad(DataTy, Alignment);
   }
 
+  bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
+    // For MVE, we have a custom lowering pass that will already have custom
+    // legalised any gathers that we can lower to MVE intrinsics, and want to
+    // expand all the rest. The pass runs before the masked intrinsic lowering
+    // pass.
+    return true;
+  }
+
+  bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
+    return forceScalarizeMaskedGather(VTy, Alignment);
+  }
+
   bool isLegalMaskedGather(Type *Ty, Align Alignment);
 
   bool isLegalMaskedScatter(Type *Ty, Align Alignment) {
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1116,18 +1116,6 @@
   if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
     return false;
 
-  // This method is called in 2 places:
-  //  - from the vectorizer with a scalar type, in which case we need to get
-  //  this as good as we can with the limited info we have (and rely on the cost
-  //  model for the rest).
-  //  - from the masked intrinsic lowering pass with the actual vector type.
-  // For MVE, we have a custom lowering pass that will already have custom
-  // legalised any gathers that we can to MVE intrinsics, and want to expand all
-  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
-  // are here, we know we want to expand.
-  if (isa<FixedVectorType>(Ty))
-    return false;
-
   unsigned EltWidth = Ty->getScalarSizeInBits();
   return ((EltWidth == 32 && Alignment >= 4) ||
           (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
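For MVE the vector-type special case simply moves out of the legality query and into the new hook: `isLegalMaskedGather` now only checks element width and alignment, while `forceScalarizeMaskedGather` always answers true, because anything worth keeping as a gather has already been turned into MVE intrinsics by the MVE gather/scatter lowering pass that runs earlier. A rough illustration of the answers an MVE subtarget gives after this change (assumes the usual LLVM headers and that the ARM masked gather/scatter option is enabled; this is not code from the patch):

```cpp
// Queries as seen through ARMTTIImpl for a <4 x i32> gather with alignment 4.
void mveGatherQueries(ARMTTIImpl &TTI, LLVMContext &Ctx) {
  auto *V4I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  bool Legal = TTI.isLegalMaskedGather(V4I32, Align(4));        // i32, align >= 4 -> true
  bool Force = TTI.forceScalarizeMaskedGather(V4I32, Align(4)); // always true for MVE
  // ScalarizeMaskedMemIntrin expands unless Legal && !Force, so the gather is
  // still scalarized here -- the same outcome the removed isa<FixedVectorType>
  // early-return used to produce.
  (void)Legal;
  (void)Force;
}
```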
Index: llvm/lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -226,6 +226,10 @@
   bool isLegalMaskedStore(Type *DataType, Align Alignment);
   bool isLegalNTLoad(Type *DataType, Align Alignment);
   bool isLegalNTStore(Type *DataType, Align Alignment);
+  bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment);
+  bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
+    return forceScalarizeMaskedGather(VTy, Alignment);
+  }
   bool isLegalMaskedGather(Type *DataType, Align Alignment);
   bool isLegalMaskedScatter(Type *DataType, Align Alignment);
   bool isLegalMaskedExpandLoad(Type *DataType);
Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4995,9 +4995,13 @@
                                        const Instruction *I = nullptr) {
   if (CostKind != TTI::TCK_RecipThroughput) {
     if ((Opcode == Instruction::Load &&
-         isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+         isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
+         !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
+                                     Align(Alignment))) ||
         (Opcode == Instruction::Store &&
-         isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+         isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
+         !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
+                                      Align(Alignment))))
       return 1;
     return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
                                          Alignment, CostKind, I);
@@ -5012,9 +5016,13 @@
   unsigned AddressSpace = PtrTy->getAddressSpace();
 
   if ((Opcode == Instruction::Load &&
-       !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+       (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
+        forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
+                                   Align(Alignment)))) ||
       (Opcode == Instruction::Store &&
-       !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+       (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
+        forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
+                                    Align(Alignment)))))
     return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
                            AddressSpace);
 
@@ -5137,35 +5145,21 @@
   return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
 }
 
+bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
+  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
+  // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
+  // it to 8 elements, but zeroing upper bits of the mask vector will add more
+  // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
+  // Check, maybe the gather/scatter instruction is better in the VariableMask
+  // case.
+  unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
+  return NumElts == 1 ||
+         (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
+}
+
 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
   if (!supportsGather())
     return false;
-
-  // This function is called now in two cases: from the Loop Vectorizer
-  // and from the Scalarizer.
-  // When the Loop Vectorizer asks about legality of the feature,
-  // the vectorization factor is not calculated yet. The Loop Vectorizer
-  // sends a scalar type and the decision is based on the width of the
-  // scalar element.
-  // Later on, the cost model will estimate usage this intrinsic based on
-  // the vector type.
-  // The Scalarizer asks again about legality. It sends a vector type.
-  // In this case we can reject non-power-of-2 vectors.
-  // We also reject single element vectors as the type legalizer can't
-  // scalarize it.
-  if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
-    unsigned NumElts = DataVTy->getNumElements();
-    if (NumElts == 1)
-      return false;
-    // Gather / Scatter for vector 2 is not profitable on KNL / SKX
-    // Vector-4 of gather/scatter instruction does not exist on KNL.
-    // We can extend it to 8 elements, but zeroing upper bits of
-    // the mask vector will add more instructions. Right now we give the scalar
-    // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter
-    // instruction is better in the VariableMask case.
-    if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())))
-      return false;
-  }
   Type *ScalarTy = DataTy->getScalarType();
   if (ScalarTy->isPointerTy())
     return true;
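X86 gets the same split: the element-type legality check stays in `isLegalMaskedGather`, while the small-vector profitability exceptions (single-element vectors, and 2- or, without AVX512VL, 4-element vectors on AVX-512) move into `forceScalarizeMaskedGather`. Callers that only look at legality, such as the SLP vectorizer, now treat those short gathers as legal, which is what the updated AVX512 SLP tests further down reflect; the cost model and the scalarizer consult the new hook as shown above. Roughly, on an AVX-512 subtarget without VLX (KNL-like; illustrative sketch, not code from the patch):

```cpp
// Answers X86TTIImpl gives for 32-bit element gathers on a KNL-like subtarget.
void knlGatherQueries(X86TTIImpl &TTI, LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  auto *V2 = FixedVectorType::get(I32, 2);   // forceScalarize: true (not profitable)
  auto *V4 = FixedVectorType::get(I32, 4);   // forceScalarize: true (no 4-element gather without VLX)
  auto *V16 = FixedVectorType::get(I32, 16); // forceScalarize: false (use the real gather)
  for (VectorType *VTy : {V2, V4, V16}) {
    bool Legal = TTI.isLegalMaskedGather(VTy, Align(4)); // true: depends only on the i32 element
    bool Force = TTI.forceScalarizeMaskedGather(VTy, Align(4));
    (void)Legal;
    (void)Force;
  }
}
```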
Index: llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -959,7 +959,8 @@
     Type *LoadTy = CI->getType();
     Align Alignment = DL.getValueOrABITypeAlignment(MA,
                                                     LoadTy->getScalarType());
-    if (TTI.isLegalMaskedGather(LoadTy, Alignment))
+    if (TTI.isLegalMaskedGather(LoadTy, Alignment) &&
+        !TTI.forceScalarizeMaskedGather(cast<VectorType>(LoadTy), Alignment))
       return false;
     scalarizeMaskedGather(DL, CI, DTU, ModifiedDT);
     return true;
@@ -970,7 +971,9 @@
     Type *StoreTy = CI->getArgOperand(0)->getType();
     Align Alignment = DL.getValueOrABITypeAlignment(MA,
                                                     StoreTy->getScalarType());
-    if (TTI.isLegalMaskedScatter(StoreTy, Alignment))
+    if (TTI.isLegalMaskedScatter(StoreTy, Alignment) &&
+        !TTI.forceScalarizeMaskedScatter(cast<VectorType>(StoreTy),
+                                         Alignment))
       return false;
     scalarizeMaskedScatter(DL, CI, DTU, ModifiedDT);
     return true;
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1543,13 +1543,16 @@
 
   /// Returns true if the target machine can represent \p V as a masked gather
   /// or scatter operation.
-  bool isLegalGatherOrScatter(Value *V) {
+  bool isLegalGatherOrScatter(Value *V,
+                              ElementCount VF = ElementCount::getFixed(1)) {
     bool LI = isa<LoadInst>(V);
     bool SI = isa<StoreInst>(V);
     if (!LI && !SI)
       return false;
     auto *Ty = getLoadStoreType(V);
     Align Align = getLoadStoreAlignment(V);
+    if (VF.isVector())
+      Ty = VectorType::get(Ty, VF);
     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
            (SI && TTI.isLegalMaskedScatter(Ty, Align));
   }
@@ -1564,16 +1567,17 @@
   }
 
   /// Returns true if \p I is an instruction that will be scalarized with
-  /// predication. Such instructions include conditional stores and
-  /// instructions that may divide by zero.
-  /// If a non-zero VF has been calculated, we check if I will be scalarized
-  /// predication for that VF.
-  bool isScalarWithPredication(Instruction *I) const;
+  /// predication when vectorizing \p I with vectorization factor \p VF. Such
+  /// instructions include conditional stores and instructions that may divide
+  /// by zero.
+  bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
 
   // Returns true if \p I is an instruction that will be predicated either
   // through scalar predication or masked load/store or masked gather/scatter.
+  // \p VF is the vectorization factor that will be used to vectorize \p I.
   // Superset of instructions that return true for isScalarWithPredication.
-  bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
+  bool isPredicatedInst(Instruction *I, ElementCount VF,
+                        bool IsKnownUniform = false) {
     // When we know the load is uniform and the original scalar loop was not
     // predicated we don't need to mark it as a predicated instruction. Any
     // vectorised blocks created when tail-folding are something artificial we
@@ -1589,7 +1593,7 @@
     // instructions.
     if (isa<LoadInst>(I) || isa<StoreInst>(I))
       return Legal->isMaskRequired(I);
-    return isScalarWithPredication(I);
+    return isScalarWithPredication(I, VF);
   }
 
   /// Returns true if \p I is a memory instruction with consecutive memory
@@ -1781,7 +1785,7 @@
 
   /// Returns true if an artificially high cost for emulated masked memrefs
   /// should be used.
-  bool useEmulatedMaskMemRefHack(Instruction *I);
+  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
 
   /// Map of scalar integer values to the smallest bitwidth they can be legally
   /// represented as. The vector equivalents of these values should be truncated
@@ -4864,7 +4868,8 @@
   Scalars[VF].insert(Worklist.begin(), Worklist.end());
 }
 
-bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
+bool LoopVectorizationCostModel::isScalarWithPredication(
+    Instruction *I, ElementCount VF) const {
   if (!blockNeedsPredicationForAnyReason(I->getParent()))
     return false;
   switch(I->getOpcode()) {
@@ -4876,11 +4881,14 @@
       return false;
     auto *Ptr = getLoadStorePointerOperand(I);
     auto *Ty = getLoadStoreType(I);
+    Type *VTy = Ty;
+    if (VF.isVector())
+      VTy = VectorType::get(Ty, VF);
     const Align Alignment = getLoadStoreAlignment(I);
     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
-                                TTI.isLegalMaskedGather(Ty, Alignment))
+                                TTI.isLegalMaskedGather(VTy, Alignment))
                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
-                                TTI.isLegalMaskedScatter(Ty, Alignment));
+                                TTI.isLegalMaskedScatter(VTy, Alignment));
   }
   case Instruction::UDiv:
   case Instruction::SDiv:
@@ -4953,7 +4961,7 @@
 
   // If the instruction is a store located in a predicated block, it will be
   // scalarized.
-  if (isScalarWithPredication(I))
+  if (isScalarWithPredication(I, VF))
     return false;
 
   // If the instruction's allocated size doesn't equal it's type size, it
@@ -5004,7 +5012,7 @@
                       << *I << "\n");
     return;
   }
-  if (isScalarWithPredication(I)) {
+  if (isScalarWithPredication(I, VF)) {
     LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                       << *I << "\n");
     return;
@@ -6433,7 +6441,8 @@
   return RUs;
 }
 
-bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
+                                                           ElementCount VF) {
   // TODO: Cost model for emulated masked load/store is completely
   // broken. This hack guides the cost model to use an artificially
   // high enough value to practically disable vectorization with such
@@ -6442,8 +6451,7 @@
   // from moving "masked load/store" check from legality to cost model.
   // Masked Load/Gather emulation was previously never allowed.
   // Limited number of Masked Store/Scatter emulation was allowed.
-  assert(isPredicatedInst(I) &&
-         "Expecting a scalar emulated instruction");
+  assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
   return isa<LoadInst>(I) ||
          (isa<StoreInst>(I) &&
           NumPredStores > NumberOfStoresToPredicate);
@@ -6470,13 +6478,13 @@
     if (!blockNeedsPredicationForAnyReason(BB))
       continue;
     for (Instruction &I : *BB)
-      if (isScalarWithPredication(&I)) {
+      if (isScalarWithPredication(&I, VF)) {
         ScalarCostsTy ScalarCosts;
         // Do not apply discount if scalable, because that would lead to
         // invalid scalarization costs.
         // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
-        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
+        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
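The LoopVectorize.cpp hunks (above and continuing below) all follow the same pattern: the candidate VF is threaded through the predication and legality queries so the TTI hooks are asked about the vector type that will actually be generated rather than the scalar element type. A condensed sketch of that pattern (illustrative free function, not code from the patch; `getLoadStoreType` and `getLoadStoreAlignment` are the existing LLVM helpers already used in the hunks above):

```cpp
// Ask the target about the type it will actually see for this VF.
static bool gatherIsLegalForVF(const TargetTransformInfo &TTI, Instruction *I,
                               ElementCount VF) {
  Type *Ty = getLoadStoreType(I); // scalar element type of the load/store
  if (VF.isVector())
    Ty = VectorType::get(Ty, VF); // handles both fixed and scalable VFs
  return TTI.isLegalMaskedGather(Ty, getLoadStoreAlignment(I));
}
```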
@@ -6512,7 +6520,7 @@
 
     // If the instruction is scalar with predication, it will be analyzed
     // separately. We ignore it within the context of PredInst.
-    if (isScalarWithPredication(I))
+    if (isScalarWithPredication(I, VF))
       return false;
 
     // If any of the instruction's operands are uniform after vectorization,
@@ -6559,7 +6567,7 @@
 
     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
-    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
       ScalarCost += TTI.getScalarizationOverhead(
           cast<VectorType>(ToVectorTy(I->getType(), VF)),
           APInt::getAllOnes(VF.getFixedValue()), true, false);
@@ -6722,7 +6730,7 @@
   // If we have a predicated load/store, it will need extra i1 extracts and
   // conditional branches, but may not be executed for each vector lane. Scale
   // the cost by the probability of executing the predicated block.
-  if (isPredicatedInst(I)) {
+  if (isPredicatedInst(I, VF)) {
     Cost /= getReciprocalPredBlockProb();
 
     // Add the cost of an i1 extract and a branch
@@ -6733,7 +6741,7 @@
         /*Insert=*/false, /*Extract=*/true);
     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
 
-    if (useEmulatedMaskMemRefHack(I))
+    if (useEmulatedMaskMemRefHack(I, VF))
       // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
@@ -7140,7 +7148,7 @@
       // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
-      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+      if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
         NumPredStores++;
 
       if (Legal->isUniformMemOp(I)) {
@@ -7150,7 +7158,7 @@
         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
         InstructionCost Cost;
         if (isa<StoreInst>(&I) && VF.isScalable() &&
-            isLegalGatherOrScatter(&I)) {
+            isLegalGatherOrScatter(&I, VF)) {
           Cost = getGatherScatterCost(&I, VF);
           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
         } else {
@@ -7192,7 +7200,7 @@
       }
       InstructionCost GatherScatterCost =
-          isLegalGatherOrScatter(&I)
+          isLegalGatherOrScatter(&I, VF)
               ? getGatherScatterCost(&I, VF) * NumAccesses
               : InstructionCost::getInvalid();
@@ -7395,7 +7403,7 @@
   // vector lane. Get the scalarization cost and scale this amount by the
   // probability of executing the predicated block. If the instruction is not
   // predicated, we fall through to the next case.
-  if (VF.isVector() && isScalarWithPredication(I)) {
+  if (VF.isVector() && isScalarWithPredication(I, VF)) {
     InstructionCost Cost = 0;
 
     // These instructions have a non-void type, so account for the phi nodes
@@ -8568,7 +8576,9 @@
                                                    VFRange &Range) const {
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
-      [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
+      [this, CI](ElementCount VF) {
+        return CM.isScalarWithPredication(CI, VF);
+      },
       Range);
 
   if (IsPredicated)
@@ -8608,7 +8618,8 @@
   // scalarization is profitable or it is predicated.
auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); + CM.isProfitableToScalarize(I, VF) || + CM.isScalarWithPredication(I, VF); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); @@ -8682,7 +8693,7 @@ Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); }, + [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); }, Range); // Even if the instruction is not marked as uniform, there are certain Index: llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll @@ -90,4 +90,4 @@ ret void } -attributes #0 = { "target-features"="+neon,+sve,+v8.1a" } +attributes #0 = { "target-features"="+neon,+sve,+v8.1a" vscale_range(2, 0) } Index: llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll @@ -125,7 +125,7 @@ br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0 } -attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve"} +attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve" vscale_range(2,0) } !0 = distinct !{!0, !1, !2, !3, !4, !5} Index: llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -2,8 +2,8 @@ ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 @b = global [8 x i32] zeroinitializer, align 16 @@ -31,6 +31,12 @@ ; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> ; AVX-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16 ; AVX-NEXT: ret void +; +; AVX512-LABEL: @foo( +; AVX512-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> , i32 8, <2 x i1> , <2 x i32> undef) +; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; AVX512-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16 +; AVX512-NEXT: ret void ; %1 = load i32, i32* 
getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16 store i32 %1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16 Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -105,19 +105,23 @@ ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 ; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX512F-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 +; AVX512F-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 ; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 +; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 +; AVX512F-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX512F-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 +; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 +; AVX512F-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX512F-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 +; AVX512F-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( @@ -255,35 +259,43 @@ ; ; AVX512F-LABEL: @gather_load_3( ; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX512F-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 +; AVX512F-NEXT: store i32 [[TMP4]], i32* 
[[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 ; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0 -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i64 1 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i64 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX512F-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], -; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 -; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX512F-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512F-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0 -; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i64 1 -; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i64 2 -; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 -; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], -; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 +; AVX512F-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 +; AVX512F-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 +; AVX512F-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX512F-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 +; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 +; AVX512F-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX512F-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 +; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; AVX512F-NEXT: store i32 
[[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX512F-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 +; AVX512F-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 +; AVX512F-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX512F-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 +; AVX512F-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 +; AVX512F-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX512F-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 +; AVX512F-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( @@ -457,13 +469,19 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( +; AVX512F-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX512F-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX512F-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 +; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 ; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 +; AVX512F-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 +; AVX512F-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX512F-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] @@ -473,20 +491,22 @@ ; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 -; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i64 1 -; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i64 2 -; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], 
i32 [[T31]], i64 3 -; AVX512F-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512F-NEXT: [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 +; AVX512F-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 +; AVX512F-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 +; AVX512F-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 +; AVX512F-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 +; AVX512F-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 +; AVX512F-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 +; AVX512F-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 +; AVX512F-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -105,19 +105,23 @@ ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 ; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX512F-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 +; AVX512F-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 ; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 +; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 +; AVX512F-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], 
i64 3 +; AVX512F-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 +; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 +; AVX512F-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX512F-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 +; AVX512F-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( @@ -255,35 +259,43 @@ ; ; AVX512F-LABEL: @gather_load_3( ; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX512F-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 +; AVX512F-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 ; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0 -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i64 1 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i64 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3 -; AVX512F-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], -; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 -; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX512F-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512F-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i64 0 -; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i64 1 -; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i64 2 -; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 -; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], -; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 +; 
AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 +; AVX512F-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 +; AVX512F-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 +; AVX512F-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX512F-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 +; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 +; AVX512F-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX512F-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 +; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; AVX512F-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX512F-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 +; AVX512F-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 +; AVX512F-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX512F-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 +; AVX512F-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 +; AVX512F-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX512F-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 +; AVX512F-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( @@ -457,13 +469,19 @@ ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( +; AVX512F-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX512F-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX512F-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 +; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 ; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 +; AVX512F-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 +; AVX512F-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX512F-NEXT: [[T29:%.*]] = 
getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] @@ -473,20 +491,22 @@ ; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 -; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i64 1 -; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i64 2 -; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T31]], i64 3 -; AVX512F-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512F-NEXT: [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* -; AVX512F-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 +; AVX512F-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 +; AVX512F-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 +; AVX512F-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 +; AVX512F-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 +; AVX512F-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 +; AVX512F-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 +; AVX512F-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 +; AVX512F-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4(