diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1553,7 +1553,7 @@ Value *VectorizedValue = nullptr; /// Do we need to gather this sequence ? - enum EntryState { Vectorize, NeedToGather }; + enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; EntryState State; /// Does this sequence require some shuffling? @@ -2841,6 +2841,14 @@ } return; } + // Vectorizing non-consecutive loads with `llvm.masked.gather`. + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + TE->State = TreeEntry::ScatterVectorize; + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + return; } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); @@ -3682,9 +3690,17 @@ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; - int VecLdCost = - TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, - CostKind, VL0); + int VecLdCost; + if (E->State == TreeEntry::Vectorize) { + VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, + CostKind, VL0); + } else { + assert(E->TreeEntry == TreeEntry::ScatterVectorize && + "Unknown EntryState"); + VecLdCost = TTI->getGatherScatterOpCost( + Instruction::Load, VecTy, cast(VL0)->getPointerOperand(), + /*VariableMask=*/false, alignment, CostKind, VL0); + } if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecLdCost += TTI->getShuffleCost( @@ -4505,20 +4521,28 @@ setInsertPointAfterBundle(E); LoadInst *LI = cast(VL0); + Instruction *NewLI; unsigned AS = LI->getPointerAddressSpace(); + Value *PO = LI->getPointerOperand(); + if (E->State == TreeEntry::Vectorize) { - Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo(AS)); + Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); - // The pointer operand uses an in-tree scalar so we add the new BitCast to - // ExternalUses list to make sure that an extract will be generated in the - // future. - Value *PO = LI->getPointerOperand(); - if (getTreeEntry(PO)) - ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); + // The pointer operand uses an in-tree scalar so we add the new BitCast + // to ExternalUses list to make sure that an extract will be generated + // in the future. + if (getTreeEntry(PO)) + ExternalUses.emplace_back(PO, cast(VecPtr), 0); + + NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); + } else { + assert(E->TreeEntry == TreeEntry::ScatterVectorize && + "Unknown EntryState"); + Value *VecPtr = vectorizeTree(E->getOperand(0)); + NewLI = Builder.CreateMaskedGather(VecPtr, LI->getAlign()); + } + Value *V = propagateMetadata(NewLI, E->Scalars); - LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); - Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { SmallVector Mask; inversePermutation(E->ReorderIndices, Mask); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -229,35 +229,34 @@ define void @lookahead_external_uses(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2) { ; CHECK-LABEL: @lookahead_external_uses( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 ; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0 ; CHECK-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0 ; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0 -; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 +; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 -; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double*> undef, double* [[A]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[A]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 -; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 ; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8 -; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> , <2 x double> undef) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double*> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A1]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[B2]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP12]], [[TMP11]] ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8 ; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8 ; CHECK-NEXT: ret void ; @@ -328,31 +327,27 @@ ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 ; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8 ; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] +; CHECK-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]] +; CHECK-NEXT: [[SUBC0D0:%.*]] = fsub fast double [[C0]], [[D0]] +; CHECK-NEXT: [[SUBA1B2:%.*]] = fsub fast double [[A1]], [[B2]] +; CHECK-NEXT: [[SUBA2B1:%.*]] = fsub fast double [[A2]], [[B1]] +; CHECK-NEXT: [[ADD0:%.*]] = fadd fast double [[SUBA0B0]], [[SUBC0D0]] +; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[SUBA1B2]], [[SUBA2B1]] ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8 -; CHECK-NEXT: store double [[TMP12]], double* [[EXT2:%.*]], align 8 -; CHECK-NEXT: store double [[TMP12]], double* [[EXT3:%.*]], align 8 +; CHECK-NEXT: store double [[ADD0]], double* [[IDXS0]], align 8 +; CHECK-NEXT: store double [[ADD1]], double* [[IDXS1]], align 8 +; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8 +; CHECK-NEXT: store double [[A1]], double* [[EXT2:%.*]], align 8 +; CHECK-NEXT: store double [[A1]], double* [[EXT3:%.*]], align 8 ; CHECK-NEXT: store double [[B1]], double* [[EXT4:%.*]], align 8 ; CHECK-NEXT: store double [[B1]], double* [[EXT5:%.*]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -10,18 +10,24 @@ @a = global [8 x i32] zeroinitializer, align 16 define void @foo() { -; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16 -; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8 -; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 1), align 4 -; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 2), align 8 -; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 3), align 4 -; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 4), align 16 -; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 5), align 4 -; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 6), align 8 -; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 7), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @foo( +; SSE-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16 +; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16 +; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8 +; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 1), align 4 +; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 2), align 8 +; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 3), align 4 +; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 4), align 16 +; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 5), align 4 +; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 6), align 8 +; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 7), align 4 +; SSE-NEXT: ret void +; +; AVX-LABEL: @foo( +; AVX-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> , i32 16, <2 x i1> , <2 x i32> undef) +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; AVX-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16 +; AVX-NEXT: ret void ; %1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16 store i32 %1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -214,34 +214,104 @@ ; SSE-NEXT: store i32 [[T32]], i32* [[T29]], align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4 -; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4 -; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4 -; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4 -; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4 -; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4 -; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4 -; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[T3]], i32 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7 -; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], -; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* -; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @gather_load_4( +; AVX1-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 +; AVX1-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX1-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 +; AVX1-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX1-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 +; AVX1-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX1-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 +; AVX1-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 +; AVX1-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 +; AVX1-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 +; AVX1-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 +; AVX1-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX1-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 +; AVX1-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX1-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4 +; AVX1-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4 +; AVX1-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4 +; AVX1-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4 +; AVX1-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4 +; AVX1-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4 +; AVX1-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4 +; AVX1-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4 +; AVX1-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 +; AVX1-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 +; AVX1-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 +; AVX1-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 +; AVX1-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 +; AVX1-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 +; AVX1-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 +; AVX1-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 +; AVX1-NEXT: store i32 [[T4]], i32* [[T0]], align 4 +; AVX1-NEXT: store i32 [[T8]], i32* [[T5]], align 4 +; AVX1-NEXT: store i32 [[T12]], i32* [[T9]], align 4 +; AVX1-NEXT: store i32 [[T16]], i32* [[T13]], align 4 +; AVX1-NEXT: store i32 [[T20]], i32* [[T17]], align 4 +; AVX1-NEXT: store i32 [[T24]], i32* [[T21]], align 4 +; AVX1-NEXT: store i32 [[T28]], i32* [[T25]], align 4 +; AVX1-NEXT: store i32 [[T32]], i32* [[T29]], align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @gather_load_4( +; AVX2-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> undef, i32* [[T1:%.*]], i32 0 +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX2-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 +; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 +; AVX2-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX2-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 +; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4 +; AVX2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef) +; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4 +; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4 +; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4 +; AVX2-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 +; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], +; AVX2-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 +; AVX2-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 +; AVX2-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 +; AVX2-NEXT: store i32 [[T4]], i32* [[T0]], align 4 +; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* +; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 +; AVX2-NEXT: store i32 [[T24]], i32* [[T21]], align 4 +; AVX2-NEXT: store i32 [[T28]], i32* [[T25]], align 4 +; AVX2-NEXT: store i32 [[T32]], i32* [[T29]], align 4 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @gather_load_4( +; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> undef, i32* [[T1:%.*]], i32 0 +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 +; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 +; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 +; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 +; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4 +; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef) +; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4 +; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4 +; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4 +; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 +; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], +; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 +; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 +; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 +; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4 +; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* +; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 +; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4 +; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4 +; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4 +; AVX512-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 %t6 = getelementptr inbounds i32, i32* %t1, i64 11