diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1577,6 +1577,9 @@
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;
 
+    /// Boolean value indicating that pointer operands are scattered.
+    bool IsScatteredOps = false;
+
   private:
     /// The operands of each instruction in each lane Operands[op_index][lane].
     /// Note: This helps avoid the replication of the code that performs the
@@ -2841,6 +2844,13 @@
         }
         return;
       }
+      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                   ReuseShuffleIndicies);
+      TE->setOperandsInOrder();
+      TE->IsScatteredOps = true;
+      LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+      return;
     }
 
     LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
@@ -3648,9 +3658,15 @@
         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
       int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
-      int VecLdCost =
-          TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
-                               CostKind, VL0);
+      int VecLdCost;
+      if (!E->IsScatteredOps) {
+        VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
+                                         CostKind, VL0);
+      } else {
+        VecLdCost = TTI->getGatherScatterOpCost(
+            Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
+            false, alignment, CostKind, VL0);
+      }
       if (!E->ReorderIndices.empty()) {
         // TODO: Merge this shuffle with the ReuseShuffleCost.
         VecLdCost += TTI->getShuffleCost(
@@ -4470,21 +4486,42 @@
       VL0 = E->getMainOp();
       setInsertPointAfterBundle(E);
 
+      Value *V;
       LoadInst *LI = cast<LoadInst>(VL0);
       unsigned AS = LI->getPointerAddressSpace();
-      Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
-                                            VecTy->getPointerTo(AS));
+      if (!E->IsScatteredOps) {
+        Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+                                              VecTy->getPointerTo(AS));
 
-      // The pointer operand uses an in-tree scalar so we add the new BitCast to
-      // ExternalUses list to make sure that an extract will be generated in the
-      // future.
-      Value *PO = LI->getPointerOperand();
-      if (getTreeEntry(PO))
-        ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+        // The pointer operand uses an in-tree scalar so we add the new BitCast
+        // to ExternalUses list to make sure that an extract will be generated
+        // in the future.
+        Value *PO = LI->getPointerOperand();
+        if (getTreeEntry(PO))
+          ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+
+        LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
+        V = propagateMetadata(LI, E->Scalars);
+      } else {
+        Value *Val0 = LI->getPointerOperand();
+        FixedVectorType *VecTy =
+            FixedVectorType::get(Val0->getType(), E->Scalars.size());
+        Value *VecPtr = UndefValue::get(VecTy);
+        unsigned InsIndex = 0;
+        for (Value *Val : E->Scalars) {
+          Value *PO = cast<LoadInst>(Val)->getPointerOperand();
+          VecPtr = Builder.CreateInsertElement(VecPtr, PO,
+                                               Builder.getInt32(InsIndex));
+          if (getTreeEntry(PO))
+            ExternalUses.push_back(
+                ExternalUser(PO, cast<User>(VecPtr), InsIndex));
+          InsIndex++;
+        }
+        Instruction *MG = Builder.CreateMaskedGather(VecPtr, LI->getAlign());
+        V = propagateMetadata(MG, E->Scalars);
+      }
 
-      LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
-      Value *V = propagateMetadata(LI, E->Scalars);
       if (IsReorder) {
         SmallVector<int, 4> Mask;
         inversePermutation(E->ReorderIndices, Mask);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -237,27 +237,26 @@
 ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
 ; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
 ; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
-; CHECK-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
 ; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
 ; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
-; CHECK-NEXT:    [[A2:%.*]] = load double, double* [[IDXA2]], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double*> undef, double* [[IDXA0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[IDXA2]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP1]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A1]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B2]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP9]]
 ; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8
 ; CHECK-NEXT:    store double [[A1]], double* [[EXT1:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
@@ -328,31 +327,30 @@
 ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
 ; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
 ; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
-; CHECK-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
 ; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
 ; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double*> undef, double* [[IDXB0]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double*> [[TMP2]], double* [[IDXB2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP3]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
 ; CHECK-NEXT:    [[A2:%.*]] = load double, double* [[IDXA2]], align 8
 ; CHECK-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A2]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[B1]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = fsub fast <2 x double> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP10]]
 ; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
 ; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-; CHECK-NEXT:    store double [[TMP12]], double* [[EXT1:%.*]], align 8
-; CHECK-NEXT:    store double [[TMP12]], double* [[EXT2:%.*]], align 8
-; CHECK-NEXT:    store double [[TMP12]], double* [[EXT3:%.*]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT:    store double [[TMP13]], double* [[EXT1:%.*]], align 8
+; CHECK-NEXT:    store double [[TMP13]], double* [[EXT2:%.*]], align 8
+; CHECK-NEXT:    store double [[TMP13]], double* [[EXT3:%.*]], align 8
 ; CHECK-NEXT:    store double [[B1]], double* [[EXT4:%.*]], align 8
 ; CHECK-NEXT:    store double [[B1]], double* [[EXT5:%.*]], align 8
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -4,19 +4,16 @@
 define void @gather_load(i32* %0, i32* readonly %1) {
 ; CHECK-LABEL: @gather_load(
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32*> [[TMP6]], i32* [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32*> [[TMP7]], i32* [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32*> [[TMP8]], i32* [[TMP3]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], <i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %3 = getelementptr inbounds i32, i32* %1, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
@@ -68,11 +68,13 @@
   %add24 = add nsw i32 %add10, %add17
 
   ; YAML:      Pass:            slp-vectorizer
-  ; YAML-NEXT: Name:            NotPossible
+  ; YAML-NEXT: Name:            NotBeneficial
   ; YAML-NEXT: Function:        foo
   ; YAML-NEXT: Args:
-  ; YAML-NEXT:   - String:          'Cannot SLP vectorize list: vectorization was impossible'
-  ; YAML-NEXT:   - String:          ' with available vectorization factors'
+  ; YAML-NEXT:   - String:          'List vectorization was possible but not beneficial with cost '
+  ; YAML-NEXT:   - Cost:            '0'
+  ; YAML-NEXT:   - String:          ' >= '
+  ; YAML-NEXT:   - Treshold:        '0'
 
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 8