Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -434,6 +434,11 @@
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) const;
 
+  /// If target has efficient vector element load/store instructions, it can
+  /// return true here so that insertion/extraction costs are not added to
+  /// the scalarization cost of a load/store.
+  bool supportsEfficientVectorElementLoadStore() const;
+
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
@@ -781,6 +786,7 @@
       getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
   virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                                     unsigned VF) = 0;
+  virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
@@ -985,6 +991,10 @@
     return Impl.getOperandsScalarizationOverhead(Args, VF);
   }
 
+  bool supportsEfficientVectorElementLoadStore() override {
+    return Impl.supportsEfficientVectorElementLoadStore();
+  }
+
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -262,6 +262,8 @@
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) { return 0; }
 
+  bool supportsEfficientVectorElementLoadStore() { return false; }
+
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
   bool enableInterleavedAccessVectorization() { return false; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -197,6 +197,10 @@
   return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
 }
 
+bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
+  return TTIImpl->supportsEfficientVectorElementLoadStore();
+}
+
 bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const {
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -55,6 +55,7 @@
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
 
+  bool supportsEfficientVectorElementLoadStore() { return true; }
   bool enableInterleavedAccessVectorization() { return true; }
 
   int getArithmeticInstrCost(
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3761,13 +3761,17 @@
   unsigned Cost = 0;
   Type *RetTy = ToVectorTy(I->getType(), VF);
 
-  if (!RetTy->isVoidTy())
+  if (!RetTy->isVoidTy() &&
+      (!isa<LoadInst>(I) ||
+       !TTI.supportsEfficientVectorElementLoadStore()))
     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
 
   if (CallInst *CI = dyn_cast<CallInst>(I)) {
     SmallVector<const Value *, 4> Operands(CI->arg_operands());
     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  } else {
+  }
+  else if (!isa<StoreInst>(I) ||
+           !TTI.supportsEfficientVectorElementLoadStore()) {
     SmallVector<const Value *, 4> Operands(I->operand_values());
     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
   }
Index: test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \
+; RUN:   FileCheck %s
+;
+; Check that a scalarized load/store does not get a cost for inserts/
+; extracts, since z13 supports element load/store.
+
+define void @fun(i32* %data, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %tmp2 = add i32 %tmp1, 1
+  store i32 %tmp2, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+
+; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4
+}
+
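Note (illustration only, not part of the patch): a sketch of the cost
arithmetic the new hook changes, for the VF=4 loop in the test above,
assuming a scalar i32 load/store cost of 1:

  // Scalarized load, default path (hook returns false):
  //   Cost = 4 * scalar-load-cost
  //        + getScalarizationOverhead(RetTy, /*Insert=*/true, /*Extract=*/false)
  //   i.e. four scalar loads plus four insertelements to rebuild the
  //   vector value.
  //
  // With supportsEfficientVectorElementLoadStore() returning true:
  //   Cost = 4 * scalar-load-cost = 4
  //   because an element load (e.g. z13's VLEF) writes directly into a
  //   vector lane, so no insertion step is modeled. Stores are symmetric:
  //   the operand extraction cost (getOperandsScalarizationOverhead) is
  //   skipped because an element store (VSTEF) reads straight from a lane.
  //   This matches the "cost of 4 for VF 4" CHECK lines in the test.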