Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -390,6 +390,9 @@
   bool isLegalMaskedScatter(Type *DataType) const;
   bool isLegalMaskedGather(Type *DataType) const;
 
+  /// Return true if target doesn't mind addresses in vectors.
+  bool prefersVectorizedAddressing() const;
+
   /// \brief Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
@@ -780,6 +783,7 @@
   virtual bool isLegalMaskedLoad(Type *DataType) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType) = 0;
   virtual bool isLegalMaskedGather(Type *DataType) = 0;
+  virtual bool prefersVectorizedAddressing() = 0;
   virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                    int64_t BaseOffset, bool HasBaseReg,
                                    int64_t Scale, unsigned AddrSpace) = 0;
@@ -969,6 +973,9 @@
   bool isLegalMaskedGather(Type *DataType) override {
     return Impl.isLegalMaskedGather(DataType);
   }
+  bool prefersVectorizedAddressing() override {
+    return Impl.prefersVectorizedAddressing();
+  }
   int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                            bool HasBaseReg, int64_t Scale,
                            unsigned AddrSpace) override {
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -231,6 +231,8 @@
   bool isLegalMaskedGather(Type *DataType) { return false; }
 
+  bool prefersVectorizedAddressing() { return true; }
+
   int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                            bool HasBaseReg, int64_t Scale,
                            unsigned AddrSpace) {
     // Guess that all legal addressing mode are free.
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -143,6 +143,10 @@
   return TTIImpl->isLegalMaskedGather(DataType);
 }
 
+bool TargetTransformInfo::prefersVectorizedAddressing() const {
+  return TTIImpl->prefersVectorizedAddressing();
+}
+
 int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                               int64_t BaseOffset,
                                               bool HasBaseReg,
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -55,6 +55,7 @@
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
 
+  bool prefersVectorizedAddressing() { return false; }
   bool supportsEfficientVectorElementLoadStore() { return true; }
   bool enableInterleavedAccessVectorization() { return true; }
 
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -325,6 +325,30 @@
   unsigned ScalarBits = Ty->getScalarSizeInBits();
 
+  // Div with a constant which is a power of 2 will be converted by
+  // DAGCombiner to use shifts. With vector shift-element instructions, a
+  // vector sdiv costs about as much as a scalar one.
+  const unsigned SDivCostEstimate = 4;
+  bool SDivPow2 = false;
+  bool UDivPow2 = false;
+  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv) &&
+      Args.size() == 2) {
+    const ConstantInt *CI = nullptr;
+    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
+      if (C->getType()->isVectorTy())
+        CI = dyn_cast_or_null<ConstantInt>(C->getSplatValue());
+      else
+        CI = dyn_cast<ConstantInt>(C);
+    }
+    if (CI != nullptr &&
+        (CI->getValue().isPowerOf2() || (-CI->getValue()).isPowerOf2())) {
+      if (Opcode == Instruction::SDiv)
+        SDivPow2 = true;
+      else
+        UDivPow2 = true;
+    }
+  }
+
   if (Ty->isVectorTy()) {
     assert (ST->hasVector() &&
             "getArithmeticInstrCost() called with vector type.");
     unsigned VF = Ty->getVectorNumElements();
@@ -333,10 +357,13 @@
     // These vector operations are custom handled, but are still supported
     // with one instruction per vector, regardless of element size.
     if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
-        Opcode == Instruction::AShr) {
+        Opcode == Instruction::AShr || UDivPow2) {
       return NumVectors;
     }
 
+    if (SDivPow2)
+      return (NumVectors * SDivCostEstimate);
+
     // These FP operations are supported with a single vector instruction for
     // double (base implementation assumes float generally costs 2). For
     // FP128, the scalar cost is 1, and there is no overhead since the values
@@ -395,6 +422,11 @@
       // 2 * ipm sequences ; xor ; shift ; compare
       return 7;
 
+    if (UDivPow2)
+      return 1;
+    if (SDivPow2)
+      return SDivCostEstimate;
+
     // An extra extension for narrow types is needed.
     if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
       // sext of op(s) for narrow types
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2107,6 +2107,10 @@
   /// The data is collected per VF.
   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
 
+  /// Holds the instructions (address computations) that are forced to be
+  /// scalarized.
+  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+
   /// Returns the expected difference in cost from scalarizing the expression
   /// feeding a predicated instruction \p PredInst. The instructions to
   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
@@ -5633,6 +5637,13 @@
     DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
   }
 
+  // Insert the forced scalars.
+  // FIXME: Currently widenPHIInstruction() often creates a dead vector
+  // induction variable when the PHI user is scalarized.
+  if (ForcedScalars.count(VF))
+    for (auto *I : ForcedScalars.find(VF)->second)
+      Worklist.insert(I);
+
   // Expand the worklist by looking through any bitcasts and getelementptr
   // instructions we've already identified as scalar. This is similar to the
   // expansion step in collectLoopUniforms(); however, here we're only
@@ -7236,9 +7247,16 @@
   if (VF > 1 && isProfitableToScalarize(I, VF))
     return VectorizationCostTy(InstsToScalarize[VF][I], false);
 
+  // Forced scalars do not have any scalarization overhead.
+  if (VF > 1 && ForcedScalars.count(VF) &&
+      ForcedScalars.find(VF)->second.count(I))
+    return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+
   Type *VectorTy;
   unsigned C = getInstructionCost(I, VF, VectorTy);
 
+  // Note: Even if all instructions are scalarized, return true if any memory
+  // accesses appear in the loop to get benefits from address folding etc.
   bool TypeNotScalarized = VF > 1 && !VectorTy->isVoidTy() &&
                            TTI.getNumberOfParts(VectorTy) < VF;
   return VectorizationCostTy(C, TypeNotScalarized);
@@ -7315,6 +7333,52 @@
       setWideningDecision(&I, VF, Decision, Cost);
     }
   }
+
+  // Make sure that any load of address and any other address computation
+  // remains scalar unless there is gather/scatter support. This avoids
+  // inevitable extracts into address registers, and also has the benefit of
+  // activating LSR more, since that pass can't optimize vectorized
+  // addresses.
+  if (TTI.prefersVectorizedAddressing())
+    return;
+
+  // Start with all scalar pointer uses.
+  SmallPtrSet<Instruction *, 8> AddrDefs;
+  for (BasicBlock *BB : TheLoop->blocks())
+    for (Instruction &I : *BB) {
+      Instruction *PtrDef = nullptr;
+      if (auto *LoadI = dyn_cast<LoadInst>(&I))
+        PtrDef = dyn_cast<Instruction>(LoadI->getPointerOperand());
+      if (auto *StoreI = dyn_cast<StoreInst>(&I))
+        PtrDef = dyn_cast<Instruction>(StoreI->getPointerOperand());
+      if (PtrDef && TheLoop->contains(PtrDef))
+        AddrDefs.insert(PtrDef);
+    }
+
+  // Add all instructions used to generate the addresses.
+  SmallVector<Instruction *, 4> Worklist;
+  for (auto *I : AddrDefs)
+    Worklist.push_back(I);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    for (auto &Op : I->operands())
+      if (auto *InstOp = dyn_cast<Instruction>(Op))
+        if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
+            AddrDefs.insert(InstOp).second == true)
+          Worklist.push_back(InstOp);
+  }
+
+  for (auto *I : AddrDefs) {
+    if (isa<LoadInst>(I)) {
+      if (getWideningDecision(I, VF) != CM_Scalarize)
+        // Scalarize a load of address
+        setWideningDecision(I, VF, CM_Scalarize,
+                            (VF * getMemoryInstructionCost(I, 1)));
+    } else
+      // Make sure I gets scalarized and a cost estimate without
+      // scalarization overhead.
+      ForcedScalars[VF].insert(I);
+  }
 }
 
 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
Index: test/Analysis/CostModel/SystemZ/div-pow2.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/SystemZ/div-pow2.ll
@@ -0,0 +1,154 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; Scalar sdiv
+
+define i64 @fun0(i64 %a) {
+  %r = sdiv i64 %a, 2
+  ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i64 %a, 2
+}
+
+define i64 @fun1(i64 %a) {
+  %r = sdiv i64 %a, -4
+  ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i64 %a, -4
+}
+
+define i32 @fun2(i32 %a) {
+  %r = sdiv i32 %a, 8
+  ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i32 %a, 8
+}
+
+define i32 @fun3(i32 %a) {
+  %r = sdiv i32 %a, -16
+  ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i32 %a, -16
+}
+
+define i16 @fun4(i16 %a) {
+  %r = sdiv i16 %a, 32
+  ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i16 %a, 32
+}
+
+define i16 @fun5(i16 %a) {
+  %r = sdiv i16 %a, -64
+  ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i16 %a, -64
+}
+
+define i8 @fun6(i8 %a) {
+  %r = sdiv i8 %a, 64
+  ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i8 %a, 64
+}
+
+define i8 @fun7(i8 %a) {
+  %r = sdiv i8 %a, -128
+  ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i8 %a, -128
+}
+
+
+; Vector sdiv
+
+define <2 x i64> @fun8(<2 x i64> %a) {
+  %r = sdiv <2 x i64> %a, <i64 2, i64 2>
+  ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <2 x i64> %a, <i64 2, i64 2>
+}
+
+define <2 x i64> @fun9(<2 x i64> %a) {
+  %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
+  ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
+}
+
+define <4 x i32> @fun10(<4 x i32> %a) {
+  %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+}
+
+define <4 x i32> @fun11(<4 x i32> %a) {
+  %r = sdiv <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
+  ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
+}
+
+define <8 x i16> @fun12(<8 x i16> %a) {
+  %r = sdiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+  ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+}
+
+define <8 x i16> @fun13(<8 x i16> %a) {
+  %r = sdiv <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
+  ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
+}
+
+define <16 x i8> @fun14(<16 x i8> %a) {
+  %r = sdiv <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+  ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+}
+
+define <16 x i8> @fun15(<16 x i8> %a) {
+  %r = sdiv <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+  ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+}
+
+
+; Vector udiv
+
+define <2 x i64> @fun20(<2 x i64> %a) {
+  %r = udiv <2 x i64> %a, <i64 2, i64 2>
+  ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <2 x i64> %a, <i64 2, i64 2>
+}
+
+define <4 x i32> @fun21(<4 x i32> %a) {
+  %r = udiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+}
+
+define <8 x i16> @fun22(<8 x i16> %a) {
+  %r = udiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+  ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+}
+
+define <16 x i8> @fun23(<16 x i8> %a) {
+  %r = udiv <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+  ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+}
Index: test/Transforms/LoopVectorize/SystemZ/addressing.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/SystemZ/addressing.ll
@@ -0,0 +1,30 @@
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -dce -instcombine -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Check that a load of address is scalarized, so that the vectorized loop
+; does not need to extract memory addresses from a vector register.
+
+define i32 @fun(i32** %PtrPtr, i32* %A) {
+;CHECK: %7 = insertelement <2 x i32> undef, i32 %5, i32 0
+;CHECK: %8 = insertelement <2 x i32> %7, i32 %6, i32 1
+;CHECK: %9 = getelementptr inbounds i32, i32* %A, i64 %index
+;CHECK: %10 = bitcast i32* %9 to <2 x i32>*
+;CHECK: store <2 x i32> %8, <2 x i32>* %10, align 4
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr = getelementptr inbounds i32*, i32** %PtrPtr, i64 %indvars.iv
+  %el = load i32*, i32** %ptr
+  %v = load i32, i32* %el
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %v, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 undef
+}
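
Note on the SDivPow2 / UDivPow2 detection in the SystemZ cost-model hunk above: a divisor such as -4 is not itself a power of two, but its negation is, and DAGCombiner still lowers such an sdiv into shifts plus a negation. That is why the patch tests both CI->getValue().isPowerOf2() and (-CI->getValue()).isPowerOf2(). The following standalone sketch (not part of the patch; plain 64-bit integers instead of APInt, names chosen here for illustration) shows the same classification on scalar divisors:

  #include <cstdint>
  #include <cstdio>

  // True if V is a non-zero power of two, like APInt::isPowerOf2().
  static bool isPowerOf2(uint64_t V) { return V != 0 && (V & (V - 1)) == 0; }

  // Mirrors the SDivPow2/UDivPow2 condition: the divisor or its negation must
  // be a power of two. The negation is done in unsigned arithmetic so that
  // INT64_MIN does not cause signed overflow.
  static bool divisorIsPow2(int64_t Divisor) {
    uint64_t U = static_cast<uint64_t>(Divisor);
    return isPowerOf2(U) || isPowerOf2(0 - U);
  }

  int main() {
    std::printf("%d %d %d\n", divisorIsPow2(2), divisorIsPow2(-4),
                divisorIsPow2(6)); // prints: 1 1 0
    return 0;
  }

This matches the costs checked in div-pow2.ll: divisors like 2 and -4 take the cheap shift-based path, while a non-power-of-two divisor such as 6 falls through to the generic (more expensive) division cost.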