diff --git a/clang/test/CodeGen/thinlto-distributed-newpm.ll b/clang/test/CodeGen/thinlto-distributed-newpm.ll
--- a/clang/test/CodeGen/thinlto-distributed-newpm.ll
+++ b/clang/test/CodeGen/thinlto-distributed-newpm.ll
@@ -62,13 +62,13 @@
 ; CHECK-O: Running pass: InstCombinePass on main
 ; CHECK-O: Running analysis: TargetLibraryAnalysis on main
 ; CHECK-O: Running analysis: OptimizationRemarkEmitterAnalysis on main
+; CHECK-O: Running analysis: TargetIRAnalysis on main
 ; CHECK-O: Running analysis: AAManager on main
 ; CHECK-O: Running analysis: BasicAA on main
 ; CHECK-O: Running analysis: ScopedNoAliasAA on main
 ; CHECK-O: Running analysis: TypeBasedAA on main
 ; CHECK-O: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-O: Running pass: SimplifyCFGPass on main
-; CHECK-O: Running analysis: TargetIRAnalysis on main
 ; CHECK-O: Finished {{.*}}Function pass manager run.
 ; CHECK-O: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
 ; CHECK-O: Running analysis: GlobalsAA
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -42,6 +42,7 @@
 class ExtractElementInst;
 class Function;
 class GlobalValue;
+class InstCombiner;
 class IntrinsicInst;
 class LoadInst;
 class LoopAccessInfo;
@@ -56,6 +57,7 @@
 class Type;
 class User;
 class Value;
+struct KnownBits;
 template <typename T> class Optional;
 
 /// Information about a load/store intrinsic defined by the target.
@@ -542,6 +544,29 @@
   /// target-independent defaults with information from \p L and \p SE.
   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                              PeelingPreferences &PP) const;
+
+  /// Targets can implement their own combines for target-specific
+  /// intrinsics. This function will be called from the InstCombine pass every
+  /// time a target-specific intrinsic is encountered.
+  ///
+  /// \returns None to not do anything target specific, or a value that will be
+  /// returned from the InstCombiner. Returning nullptr stops any further
+  /// processing of the intrinsic.
+  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+                                               IntrinsicInst &II) const;
+  /// Can be used to implement target-specific instruction combining.
+  /// \see instCombineIntrinsic
+  Optional<Value *>
+  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
+                                   APInt DemandedMask, KnownBits &Known,
+                                   bool &KnownBitsComputed) const;
+  /// Can be used to implement target-specific instruction combining.
+ /// \see instCombineIntrinsic + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const; /// @} /// \name Scalar Target Information @@ -1301,6 +1326,17 @@ AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI) = 0; virtual bool emitGetActiveLaneMask() = 0; + virtual Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) = 0; + virtual Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) = 0; + virtual Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) = 0; virtual bool isLegalAddImmediate(int64_t Imm) = 0; virtual bool isLegalICmpImmediate(int64_t Imm) = 0; virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, @@ -1588,6 +1624,26 @@ bool emitGetActiveLaneMask() override { return Impl.emitGetActiveLaneMask(); } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) override { + return Impl.instCombineIntrinsic(IC, II); + } + Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) override { + return Impl.simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, + KnownBitsComputed); + } + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) override { + return Impl.simplifyDemandedVectorEltsIntrinsic( + IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + SimplifyAndSetOp); + } bool isLegalAddImmediate(int64_t Imm) override { return Impl.isLegalAddImmediate(Imm); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -147,6 +147,26 @@ return false; } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const { + return None; + } + + Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const { + return None; + } + + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const { + return None; + } + void getUnrollingPreferences(Loop *, ScalarEvolution &, TTI::UnrollingPreferences &) {} diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -477,6 +477,30 @@ return BaseT::emitGetActiveLaneMask(); } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) { + return BaseT::instCombineIntrinsic(IC, II); + } + + Optional simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, + IntrinsicInst &II, + APInt DemandedMask, + KnownBits &Known, + bool &KnownBitsComputed) { + return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, + KnownBitsComputed); + } + + Optional 
simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) { + return BaseT::simplifyDemandedVectorEltsIntrinsic( + IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + SimplifyAndSetOp); + } + int getInstructionLatency(const Instruction *I) { if (isa(I)) return getST()->getSchedModel().DefaultLoadLatency; @@ -1605,7 +1629,7 @@ } } - auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); + auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); if (MinLegalCostI != LegalCost.end()) return *MinLegalCostI; diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -199,6 +199,11 @@ /// returns Intrinsic::not_intrinsic! bool isIntrinsic() const { return HasLLVMReservedName; } + /// isTargetIntrinsic - Returns true if this function is an intrinsic and the + /// intrinsic is specific to a certain target. If this is not an intrinsic + /// or a generic intrinsic, false is returned. + bool isTargetIntrinsic() const; + /// Returns true if the function is one of the "Constrained Floating-Point /// Intrinsics". Returns false if not, and returns false when /// getIntrinsicID() returns Intrinsic::not_intrinsic. diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h @@ -0,0 +1,518 @@ +//===- InstCombiner.h - InstCombine implementation --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file provides the interface for the instcombine pass implementation. +/// The interface is used for generic transformations in this folder and +/// target specific combinations in the targets. +/// The visitor implementation is in \c InstCombinerImpl in +/// \c InstCombineInternal.h. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINER_H +#define LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINER_H + +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include + +#define DEBUG_TYPE "instcombine" + +namespace llvm { + +class AAResults; +class AssumptionCache; +class ProfileSummaryInfo; +class TargetLibraryInfo; +class TargetTransformInfo; +struct KnownBits; + +/// The core instruction combiner logic. +/// +/// This class provides both the logic to recursively visit instructions and +/// combine them. +class LLVM_LIBRARY_VISIBILITY InstCombiner { + /// Only used to call target specific inst combining. + TargetTransformInfo &TTI; + +public: + /// Maximum size of array considered when transforming. + uint64_t MaxArraySizeForCombine = 0; + + /// An IRBuilder that automatically inserts new instructions into the + /// worklist. 
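+  /// (an IRBuilder specialized with TargetFolder and an insertion callback
+  /// that pushes newly created instructions onto the worklist).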
+ using BuilderTy = IRBuilder; + BuilderTy &Builder; + +protected: + /// A worklist of the instructions that need to be simplified. + InstCombineWorklist &Worklist; + + // Mode in which we are running the combiner. + const bool MinimizeSize; + + AAResults *AA; + + // Required analyses. + AssumptionCache &AC; + TargetLibraryInfo &TLI; + DominatorTree &DT; + const DataLayout &DL; + const SimplifyQuery SQ; + OptimizationRemarkEmitter &ORE; + BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; + + // Optional analyses. When non-null, these can both be used to do better + // combining and will be updated to reflect any changes. + LoopInfo *LI; + + bool MadeIRChange = false; + +public: + InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder, + bool MinimizeSize, AAResults *AA, AssumptionCache &AC, + TargetLibraryInfo &TLI, TargetTransformInfo &TTI, + DominatorTree &DT, OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + const DataLayout &DL, LoopInfo *LI) + : TTI(TTI), Builder(Builder), Worklist(Worklist), + MinimizeSize(MinimizeSize), AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), + SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {} + + virtual ~InstCombiner() {} + + /// Return the source operand of a potentially bitcasted value while + /// optionally checking if it has one use. If there is no bitcast or the one + /// use check is not met, return the input value itself. + static Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) { + if (auto *BitCast = dyn_cast(V)) + if (!OneUseOnly || BitCast->hasOneUse()) + return BitCast->getOperand(0); + + // V is not a bitcast or V has more than one use and OneUseOnly is true. + return V; + } + + /// Assign a complexity or rank value to LLVM Values. This is used to reduce + /// the amount of pattern matching needed for compares and commutative + /// instructions. For example, if we have: + /// icmp ugt X, Constant + /// or + /// xor (add X, Constant), cast Z + /// + /// We do not have to consider the commuted variants of these patterns because + /// canonicalization based on complexity guarantees the above ordering. + /// + /// This routine maps IR values to various complexity ranks: + /// 0 -> undef + /// 1 -> Constants + /// 2 -> Other non-instructions + /// 3 -> Arguments + /// 4 -> Cast and (f)neg/not instructions + /// 5 -> Other instructions + static unsigned getComplexity(Value *V) { + if (isa(V)) { + if (isa(V) || match(V, m_Neg(PatternMatch::m_Value())) || + match(V, m_Not(PatternMatch::m_Value())) || + match(V, m_FNeg(PatternMatch::m_Value()))) + return 4; + return 5; + } + if (isa(V)) + return 3; + return isa(V) ? (isa(V) ? 0 : 1) : 2; + } + + /// Predicate canonicalization reduces the number of patterns that need to be + /// matched by other transforms. For example, we may swap the operands of a + /// conditional branch or select to create a compare with a canonical + /// (inverted) predicate which is then more likely to be matched with other + /// values. + static bool isCanonicalPredicate(CmpInst::Predicate Pred) { + switch (Pred) { + case CmpInst::ICMP_NE: + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGE: + // TODO: There are 16 FCMP predicates. Should others be (not) canonical? + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_OGE: + return false; + default: + return true; + } + } + + /// Given an exploded icmp instruction, return true if the comparison only + /// checks the sign bit. 
If it only checks the sign bit, set TrueIfSigned if + /// the result of the comparison is true when the input value is signed. + static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS, + bool &TrueIfSigned) { + switch (Pred) { + case ICmpInst::ICMP_SLT: // True if LHS s< 0 + TrueIfSigned = true; + return RHS.isNullValue(); + case ICmpInst::ICMP_SLE: // True if LHS s<= -1 + TrueIfSigned = true; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGT: // True if LHS s> -1 + TrueIfSigned = false; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGE: // True if LHS s>= 0 + TrueIfSigned = false; + return RHS.isNullValue(); + case ICmpInst::ICMP_UGT: + // True if LHS u> RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = true; + return RHS.isMaxSignedValue(); + case ICmpInst::ICMP_UGE: + // True if LHS u>= RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = true; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULT: + // True if LHS u< RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = false; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULE: + // True if LHS u<= RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = false; + return RHS.isMaxSignedValue(); + default: + return false; + } + } + + /// Add one to a Constant + static Constant *AddOne(Constant *C) { + return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); + } + + /// Subtract one from a Constant + static Constant *SubOne(Constant *C) { + return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1)); + } + + llvm::Optional> static getFlippedStrictnessPredicateAndConstant(CmpInst:: + Predicate + Pred, + Constant *C); + + /// Return true if the specified value is free to invert (apply ~ to). + /// This happens in cases where the ~ can be eliminated. If WillInvertAllUses + /// is true, work under the assumption that the caller intends to remove all + /// uses of V and only keep uses of ~V. + /// + /// See also: canFreelyInvertAllUsersOf() + static bool isFreeToInvert(Value *V, bool WillInvertAllUses) { + // ~(~(X)) -> X. + if (match(V, m_Not(PatternMatch::m_Value()))) + return true; + + // Constants can be considered to be not'ed values. + if (match(V, PatternMatch::m_AnyIntegralConstant())) + return true; + + // Compares can be inverted if all of their uses are being modified to use + // the ~V. + if (isa(V)) + return WillInvertAllUses; + + // If `V` is of the form `A + Constant` then `-1 - V` can be folded into + // `(-1 - Constant) - A` if we are willing to invert all of the uses. + if (BinaryOperator *BO = dyn_cast(V)) + if (BO->getOpcode() == Instruction::Add || + BO->getOpcode() == Instruction::Sub) + if (isa(BO->getOperand(0)) || + isa(BO->getOperand(1))) + return WillInvertAllUses; + + // Selects with invertible operands are freely invertible + if (match(V, + m_Select(PatternMatch::m_Value(), m_Not(PatternMatch::m_Value()), + m_Not(PatternMatch::m_Value())))) + return WillInvertAllUses; + + return false; + } + + /// Given i1 V, can every user of V be freely adapted if V is changed to !V ? + /// InstCombine's canonicalizeICmpPredicate() must be kept in sync with this + /// fn. + /// + /// See also: isFreeToInvert() + static bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) { + // Look at every user of V. + for (Use &U : V->uses()) { + if (U.getUser() == IgnoredUser) + continue; // Don't consider this user. 
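+      // Every remaining user must be an instruction whose use of V we know how
+      // to adapt to !V: a select condition, a branch condition, or a 'not'.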
+ + auto *I = cast(U.getUser()); + switch (I->getOpcode()) { + case Instruction::Select: + if (U.getOperandNo() != 0) // Only if the value is used as select cond. + return false; + break; + case Instruction::Br: + assert(U.getOperandNo() == 0 && "Must be branching on that value."); + break; // Free to invert by swapping true/false values/destinations. + case Instruction::Xor: // Can invert 'xor' if it's a 'not', by ignoring + // it. + if (!match(I, m_Not(PatternMatch::m_Value()))) + return false; // Not a 'not'. + break; + default: + return false; // Don't know, likely not freely invertible. + } + // So far all users were free to invert... + } + return true; // Can freely invert all users! + } + + /// Some binary operators require special handling to avoid poison and + /// undefined behavior. If a constant vector has undef elements, replace those + /// undefs with identity constants if possible because those are always safe + /// to execute. If no identity constant exists, replace undef with some other + /// safe constant. + static Constant * + getSafeVectorConstantForBinop(BinaryOperator::BinaryOps Opcode, Constant *In, + bool IsRHSConstant) { + auto *InVTy = dyn_cast(In->getType()); + assert(InVTy && "Not expecting scalars here"); + + Type *EltTy = InVTy->getElementType(); + auto *SafeC = ConstantExpr::getBinOpIdentity(Opcode, EltTy, IsRHSConstant); + if (!SafeC) { + // TODO: Should this be available as a constant utility function? It is + // similar to getBinOpAbsorber(). + if (IsRHSConstant) { + switch (Opcode) { + case Instruction::SRem: // X % 1 = 0 + case Instruction::URem: // X %u 1 = 0 + SafeC = ConstantInt::get(EltTy, 1); + break; + case Instruction::FRem: // X % 1.0 (doesn't simplify, but it is safe) + SafeC = ConstantFP::get(EltTy, 1.0); + break; + default: + llvm_unreachable( + "Only rem opcodes have no identity constant for RHS"); + } + } else { + switch (Opcode) { + case Instruction::Shl: // 0 << X = 0 + case Instruction::LShr: // 0 >>u X = 0 + case Instruction::AShr: // 0 >> X = 0 + case Instruction::SDiv: // 0 / X = 0 + case Instruction::UDiv: // 0 /u X = 0 + case Instruction::SRem: // 0 % X = 0 + case Instruction::URem: // 0 %u X = 0 + case Instruction::Sub: // 0 - X (doesn't simplify, but it is safe) + case Instruction::FSub: // 0.0 - X (doesn't simplify, but it is safe) + case Instruction::FDiv: // 0.0 / X (doesn't simplify, but it is safe) + case Instruction::FRem: // 0.0 % X = 0 + SafeC = Constant::getNullValue(EltTy); + break; + default: + llvm_unreachable("Expected to find identity constant for opcode"); + } + } + } + assert(SafeC && "Must have safe constant for binop"); + unsigned NumElts = InVTy->getNumElements(); + SmallVector Out(NumElts); + for (unsigned i = 0; i != NumElts; ++i) { + Constant *C = In->getAggregateElement(i); + Out[i] = isa(C) ? SafeC : C; + } + return ConstantVector::get(Out); + } + + /// Create and insert the idiom we use to indicate a block is unreachable + /// without having to rewrite the CFG from within InstCombine. 
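+  /// The idiom is a store of i1 'true' through an undef pointer: immediate UB
+  /// that marks the block as unreachable without touching the CFG here.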
+ static void CreateNonTerminatorUnreachable(Instruction *InsertAt) { + auto &Ctx = InsertAt->getContext(); + new StoreInst(ConstantInt::getTrue(Ctx), + UndefValue::get(Type::getInt1PtrTy(Ctx)), InsertAt); + } + + void addToWorklist(Instruction *I) { Worklist.push(I); } + + AssumptionCache &getAssumptionCache() const { return AC; } + TargetLibraryInfo &getTargetLibraryInfo() const { return TLI; } + DominatorTree &getDominatorTree() const { return DT; } + const DataLayout &getDataLayout() const { return DL; } + const SimplifyQuery &getSimplifyQuery() const { return SQ; } + OptimizationRemarkEmitter &getOptimizationRemarkEmitter() const { + return ORE; + } + BlockFrequencyInfo *getBlockFrequencyInfo() const { return BFI; } + ProfileSummaryInfo *getProfileSummaryInfo() const { return PSI; } + LoopInfo *getLoopInfo() const { return LI; } + + // Call target specific combiners + Optional targetInstCombineIntrinsic(IntrinsicInst &II); + Optional + targetSimplifyDemandedUseBitsIntrinsic(IntrinsicInst &II, APInt DemandedMask, + KnownBits &Known, + bool &KnownBitsComputed); + Optional targetSimplifyDemandedVectorEltsIntrinsic( + IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp); + + /// Inserts an instruction \p New before instruction \p Old + /// + /// Also adds the new instruction to the worklist and returns \p New so that + /// it is suitable for use as the return from the visitation patterns. + Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) { + assert(New && !New->getParent() && + "New instruction already inserted into a basic block!"); + BasicBlock *BB = Old.getParent(); + BB->getInstList().insert(Old.getIterator(), New); // Insert inst + Worklist.push(New); + return New; + } + + /// Same as InsertNewInstBefore, but also sets the debug loc. + Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) { + New->setDebugLoc(Old.getDebugLoc()); + return InsertNewInstBefore(New, Old); + } + + /// A combiner-aware RAUW-like routine. + /// + /// This method is to be used when an instruction is found to be dead, + /// replaceable with another preexisting expression. Here we add all uses of + /// I to the worklist, replace all uses of I with the new value, then return + /// I, so that the inst combiner will know that I was modified. + Instruction *replaceInstUsesWith(Instruction &I, Value *V) { + // If there are no uses to replace, then we return nullptr to indicate that + // no changes were made to the program. + if (I.use_empty()) + return nullptr; + + Worklist.pushUsersToWorkList(I); // Add all modified instrs to worklist. + + // If we are replacing the instruction with itself, this must be in a + // segment of unreachable code, so just clobber the instruction. + if (&I == V) + V = UndefValue::get(I.getType()); + + LLVM_DEBUG(dbgs() << "IC: Replacing " << I << "\n" + << " with " << *V << '\n'); + + I.replaceAllUsesWith(V); + return &I; + } + + /// Replace operand of instruction and add old operand to the worklist. + Instruction *replaceOperand(Instruction &I, unsigned OpNum, Value *V) { + Worklist.addValue(I.getOperand(OpNum)); + I.setOperand(OpNum, V); + return &I; + } + + /// Replace use and add the previously used value to the worklist. + void replaceUse(Use &U, Value *NewValue) { + Worklist.addValue(U); + U = NewValue; + } + + /// Combiner aware instruction erasure. 
+ /// + /// When dealing with an instruction that has side effects or produces a void + /// value, we can't rely on DCE to delete the instruction. Instead, visit + /// methods should return the value returned by this function. + virtual Instruction *eraseInstFromFunction(Instruction &I) = 0; + + void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth, + const Instruction *CxtI) const { + llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT); + } + + KnownBits computeKnownBits(const Value *V, unsigned Depth, + const Instruction *CxtI) const { + return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT); + } + + bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false, + unsigned Depth = 0, + const Instruction *CxtI = nullptr) { + return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT); + } + + bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0, + const Instruction *CxtI = nullptr) const { + return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT); + } + + unsigned ComputeNumSignBits(const Value *Op, unsigned Depth = 0, + const Instruction *CxtI = nullptr) const { + return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForUnsignedMul(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForSignedMul(const Value *LHS, const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForSignedMul(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForUnsignedAdd(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForSignedAdd(const Value *LHS, const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForUnsignedSub(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForUnsignedSub(LHS, RHS, DL, &AC, CxtI, &DT); + } + + OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForSignedSub(LHS, RHS, DL, &AC, CxtI, &DT); + } + + virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, + const APInt &DemandedMask, KnownBits &Known, + unsigned Depth = 0) = 0; + virtual Value * + SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, + unsigned Depth = 0, + bool AllowMultipleUsers = false) = 0; +}; + +} // namespace llvm + +#undef DEBUG_TYPE + +#endif diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -322,6 +322,29 @@ return TTIImpl->emitGetActiveLaneMask(); } +Optional +TargetTransformInfo::instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const { + return TTIImpl->instCombineIntrinsic(IC, II); +} + +Optional TargetTransformInfo::simplifyDemandedUseBitsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const { + return TTIImpl->simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, + KnownBitsComputed); +} + +Optional TargetTransformInfo::simplifyDemandedVectorEltsIntrinsic( + 
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const { + return TTIImpl->simplifyDemandedVectorEltsIntrinsic( + IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + SimplifyAndSetOp); +} + void TargetTransformInfo::getUnrollingPreferences( Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const { return TTIImpl->getUnrollingPreferences(L, SE, UP); diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -618,6 +618,10 @@ #include "llvm/IR/IntrinsicImpl.inc" #undef GET_INTRINSIC_TARGET_DATA +bool Function::isTargetIntrinsic() const { + return IntID > TargetInfos[0].Count; +} + /// Find the segment of \c IntrinsicNameTable for intrinsics with the same /// target as \c Name, or the generic table if \c Name is not target specific. /// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -0,0 +1,896 @@ +//===- AMDGPInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// This file implements a TargetTransformInfo analysis pass specific to the +// AMDGPU target machine. It uses the target's detailed information to provide +// more precise answers to certain TTI queries, while letting the target +// independent and default TTI implementations handle the rest. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetTransformInfo.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" + +using namespace llvm; + +#define DEBUG_TYPE "AMDGPUtti" + +namespace { + +struct AMDGPUImageDMaskIntrinsic { + unsigned Intr; +}; + +#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL +#include "InstCombineTables.inc" + +} // end anonymous namespace + +// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs. +// +// A single NaN input is folded to minnum, so we rely on that folding for +// handling NaNs. +static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, + const APFloat &Src2) { + APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2); + + APFloat::cmpResult Cmp0 = Max3.compare(Src0); + assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately"); + if (Cmp0 == APFloat::cmpEqual) + return maxnum(Src1, Src2); + + APFloat::cmpResult Cmp1 = Max3.compare(Src1); + assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately"); + if (Cmp1 == APFloat::cmpEqual) + return maxnum(Src0, Src2); + + return maxnum(Src0, Src1); +} + +Optional +GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::amdgcn_rcp: { + Value *Src = II.getArgOperand(0); + + // TODO: Move to ConstantFolding/InstSimplify? 
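+    // rcp(undef) is folded to a quiet NaN.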
+ if (isa(Src)) { + Type *Ty = II.getType(); + auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); + return IC.replaceInstUsesWith(II, QNaN); + } + + if (II.isStrictFP()) + break; + + if (const ConstantFP *C = dyn_cast(Src)) { + const APFloat &ArgVal = C->getValueAPF(); + APFloat Val(ArgVal.getSemantics(), 1); + Val.divide(ArgVal, APFloat::rmNearestTiesToEven); + + // This is more precise than the instruction may give. + // + // TODO: The instruction always flushes denormal results (except for f16), + // should this also? + return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val)); + } + + break; + } + case Intrinsic::amdgcn_rsq: { + Value *Src = II.getArgOperand(0); + + // TODO: Move to ConstantFolding/InstSimplify? + if (isa(Src)) { + Type *Ty = II.getType(); + auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); + return IC.replaceInstUsesWith(II, QNaN); + } + + break; + } + case Intrinsic::amdgcn_frexp_mant: + case Intrinsic::amdgcn_frexp_exp: { + Value *Src = II.getArgOperand(0); + if (const ConstantFP *C = dyn_cast(Src)) { + int Exp; + APFloat Significand = + frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven); + + if (IID == Intrinsic::amdgcn_frexp_mant) { + return IC.replaceInstUsesWith( + II, ConstantFP::get(II.getContext(), Significand)); + } + + // Match instruction special case behavior. + if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) + Exp = 0; + + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp)); + } + + if (isa(Src)) { + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + } + + break; + } + case Intrinsic::amdgcn_class: { + enum { + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity + }; + + const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | + N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | + P_NORMAL | P_INFINITY; + + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + const ConstantInt *CMask = dyn_cast(Src1); + if (!CMask) { + if (isa(Src0)) { + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + } + + if (isa(Src1)) { + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), false)); + } + break; + } + + uint32_t Mask = CMask->getZExtValue(); + + // If all tests are made, it doesn't matter what the value is. + if ((Mask & FullMask) == FullMask) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true)); + } + + if ((Mask & FullMask) == 0) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false)); + } + + if (Mask == (S_NAN | Q_NAN)) { + // Equivalent of isnan. Replace with standard fcmp. + Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0); + FCmp->takeName(&II); + return IC.replaceInstUsesWith(II, FCmp); + } + + if (Mask == (N_ZERO | P_ZERO)) { + // Equivalent of == 0. 
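+      // (an ordered compare against +0.0 also matches -0.0)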
+ Value *FCmp = + IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0)); + + FCmp->takeName(&II); + return IC.replaceInstUsesWith(II, FCmp); + } + + // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other + if (((Mask & S_NAN) || (Mask & Q_NAN)) && + isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) { + return IC.replaceOperand( + II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN))); + } + + const ConstantFP *CVal = dyn_cast(Src0); + if (!CVal) { + if (isa(Src0)) { + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + } + + // Clamp mask to used bits + if ((Mask & FullMask) != Mask) { + CallInst *NewCall = IC.Builder.CreateCall( + II.getCalledFunction(), + {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)}); + + NewCall->takeName(&II); + return IC.replaceInstUsesWith(II, NewCall); + } + + break; + } + + const APFloat &Val = CVal->getValueAPF(); + + bool Result = + ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || + ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || + ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || + ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || + ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || + ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || + ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || + ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || + ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || + ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); + + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result)); + } + case Intrinsic::amdgcn_cvt_pkrtz: { + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + if (const ConstantFP *C0 = dyn_cast(Src0)) { + if (const ConstantFP *C1 = dyn_cast(Src1)) { + const fltSemantics &HalfSem = + II.getType()->getScalarType()->getFltSemantics(); + bool LosesInfo; + APFloat Val0 = C0->getValueAPF(); + APFloat Val1 = C1->getValueAPF(); + Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); + Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); + + Constant *Folded = + ConstantVector::get({ConstantFP::get(II.getContext(), Val0), + ConstantFP::get(II.getContext(), Val1)}); + return IC.replaceInstUsesWith(II, Folded); + } + } + + if (isa(Src0) && isa(Src1)) { + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + } + + break; + } + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + + if (isa(Src0) && isa(Src1)) { + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + } + + break; + } + case Intrinsic::amdgcn_ubfe: + case Intrinsic::amdgcn_sbfe: { + // Decompose simple cases into standard shifts. + Value *Src = II.getArgOperand(0); + if (isa(Src)) { + return IC.replaceInstUsesWith(II, Src); + } + + unsigned Width; + Type *Ty = II.getType(); + unsigned IntSize = Ty->getIntegerBitWidth(); + + ConstantInt *CWidth = dyn_cast(II.getArgOperand(2)); + if (CWidth) { + Width = CWidth->getZExtValue(); + if ((Width & (IntSize - 1)) == 0) { + return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty)); + } + + // Hardware ignores high bits, so remove those. 
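+      // (only the width modulo the integer bit width is meaningful)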
+ if (Width >= IntSize) { + return IC.replaceOperand( + II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1))); + } + } + + unsigned Offset; + ConstantInt *COffset = dyn_cast(II.getArgOperand(1)); + if (COffset) { + Offset = COffset->getZExtValue(); + if (Offset >= IntSize) { + return IC.replaceOperand( + II, 1, + ConstantInt::get(COffset->getType(), Offset & (IntSize - 1))); + } + } + + bool Signed = IID == Intrinsic::amdgcn_sbfe; + + if (!CWidth || !COffset) + break; + + // The case of Width == 0 is handled above, which makes this tranformation + // safe. If Width == 0, then the ashr and lshr instructions become poison + // value since the shift amount would be equal to the bit size. + assert(Width != 0); + + // TODO: This allows folding to undef when the hardware has specific + // behavior? + if (Offset + Width < IntSize) { + Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width); + Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width) + : IC.Builder.CreateLShr(Shl, IntSize - Width); + RightShift->takeName(&II); + return IC.replaceInstUsesWith(II, RightShift); + } + + Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset) + : IC.Builder.CreateLShr(Src, Offset); + + RightShift->takeName(&II); + return IC.replaceInstUsesWith(II, RightShift); + } + case Intrinsic::amdgcn_exp: + case Intrinsic::amdgcn_exp_compr: { + ConstantInt *En = cast(II.getArgOperand(1)); + unsigned EnBits = En->getZExtValue(); + if (EnBits == 0xf) + break; // All inputs enabled. + + bool IsCompr = IID == Intrinsic::amdgcn_exp_compr; + bool Changed = false; + for (int I = 0; I < (IsCompr ? 2 : 4); ++I) { + if ((!IsCompr && (EnBits & (1 << I)) == 0) || + (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) { + Value *Src = II.getArgOperand(I + 2); + if (!isa(Src)) { + IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType())); + Changed = true; + } + } + } + + if (Changed) { + return &II; + } + + break; + } + case Intrinsic::amdgcn_fmed3: { + // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled + // for the shader. + + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + Value *Src2 = II.getArgOperand(2); + + // Checking for NaN before canonicalization provides better fidelity when + // mapping other operations onto fmed3 since the order of operands is + // unchanged. + CallInst *NewCall = nullptr; + if (match(Src0, PatternMatch::m_NaN()) || isa(Src0)) { + NewCall = IC.Builder.CreateMinNum(Src1, Src2); + } else if (match(Src1, PatternMatch::m_NaN()) || isa(Src1)) { + NewCall = IC.Builder.CreateMinNum(Src0, Src2); + } else if (match(Src2, PatternMatch::m_NaN()) || isa(Src2)) { + NewCall = IC.Builder.CreateMaxNum(Src0, Src1); + } + + if (NewCall) { + NewCall->copyFastMathFlags(&II); + NewCall->takeName(&II); + return IC.replaceInstUsesWith(II, NewCall); + } + + bool Swap = false; + // Canonicalize constants to RHS operands. 
+ // + // fmed3(c0, x, c1) -> fmed3(x, c0, c1) + if (isa(Src0) && !isa(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (isa(Src1) && !isa(Src2)) { + std::swap(Src1, Src2); + Swap = true; + } + + if (isa(Src0) && !isa(Src1)) { + std::swap(Src0, Src1); + Swap = true; + } + + if (Swap) { + II.setArgOperand(0, Src0); + II.setArgOperand(1, Src1); + II.setArgOperand(2, Src2); + return &II; + } + + if (const ConstantFP *C0 = dyn_cast(Src0)) { + if (const ConstantFP *C1 = dyn_cast(Src1)) { + if (const ConstantFP *C2 = dyn_cast(Src2)) { + APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), + C2->getValueAPF()); + return IC.replaceInstUsesWith( + II, ConstantFP::get(IC.Builder.getContext(), Result)); + } + } + } + + break; + } + case Intrinsic::amdgcn_icmp: + case Intrinsic::amdgcn_fcmp: { + const ConstantInt *CC = cast(II.getArgOperand(2)); + // Guard against invalid arguments. + int64_t CCVal = CC->getZExtValue(); + bool IsInteger = IID == Intrinsic::amdgcn_icmp; + if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || + CCVal > CmpInst::LAST_ICMP_PREDICATE)) || + (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || + CCVal > CmpInst::LAST_FCMP_PREDICATE))) + break; + + Value *Src0 = II.getArgOperand(0); + Value *Src1 = II.getArgOperand(1); + + if (auto *CSrc0 = dyn_cast(Src0)) { + if (auto *CSrc1 = dyn_cast(Src1)) { + Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); + if (CCmp->isNullValue()) { + return IC.replaceInstUsesWith( + II, ConstantExpr::getSExt(CCmp, II.getType())); + } + + // The result of V_ICMP/V_FCMP assembly instructions (which this + // intrinsic exposes) is one bit per thread, masked with the EXEC + // register (which contains the bitmask of live threads). So a + // comparison that always returns true is the same as a read of the + // EXEC register. + Function *NewF = Intrinsic::getDeclaration( + II.getModule(), Intrinsic::read_register, II.getType()); + Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")}; + MDNode *MD = MDNode::get(II.getContext(), MDArgs); + Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)}; + CallInst *NewCall = IC.Builder.CreateCall(NewF, Args); + NewCall->addAttribute(AttributeList::FunctionIndex, + Attribute::Convergent); + NewCall->takeName(&II); + return IC.replaceInstUsesWith(II, NewCall); + } + + // Canonicalize constants to RHS. 
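+      // Swapping the operands also requires swapping the predicate.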
+ CmpInst::Predicate SwapPred = + CmpInst::getSwappedPredicate(static_cast(CCVal)); + II.setArgOperand(0, Src1); + II.setArgOperand(1, Src0); + II.setArgOperand( + 2, ConstantInt::get(CC->getType(), static_cast(SwapPred))); + return &II; + } + + if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) + break; + + // Canonicalize compare eq with true value to compare != 0 + // llvm.amdgcn.icmp(zext (i1 x), 1, eq) + // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) + // llvm.amdgcn.icmp(sext (i1 x), -1, eq) + // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) + Value *ExtSrc; + if (CCVal == CmpInst::ICMP_EQ && + ((match(Src1, PatternMatch::m_One()) && + match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) || + (match(Src1, PatternMatch::m_AllOnes()) && + match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) && + ExtSrc->getType()->isIntegerTy(1)) { + IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType())); + IC.replaceOperand(II, 2, + ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); + return &II; + } + + CmpInst::Predicate SrcPred; + Value *SrcLHS; + Value *SrcRHS; + + // Fold compare eq/ne with 0 from a compare result as the predicate to the + // intrinsic. The typical use is a wave vote function in the library, which + // will be fed from a user code condition compared with 0. Fold in the + // redundant compare. + + // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) + // -> llvm.amdgcn.[if]cmp(a, b, pred) + // + // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) + // -> llvm.amdgcn.[if]cmp(a, b, inv pred) + if (match(Src1, PatternMatch::m_Zero()) && + match(Src0, PatternMatch::m_ZExtOrSExt( + m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS), + PatternMatch::m_Value(SrcRHS))))) { + if (CCVal == CmpInst::ICMP_EQ) + SrcPred = CmpInst::getInversePredicate(SrcPred); + + Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) + ? Intrinsic::amdgcn_fcmp + : Intrinsic::amdgcn_icmp; + + Type *Ty = SrcLHS->getType(); + if (auto *CmpType = dyn_cast(Ty)) { + // Promote to next legal integer type. + unsigned Width = CmpType->getBitWidth(); + unsigned NewWidth = Width; + + // Don't do anything for i1 comparisons. + if (Width == 1) + break; + + if (Width <= 16) + NewWidth = 16; + else if (Width <= 32) + NewWidth = 32; + else if (Width <= 64) + NewWidth = 64; + else if (Width > 64) + break; // Can't handle this. + + if (Width != NewWidth) { + IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth); + if (CmpInst::isSigned(SrcPred)) { + SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy); + SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy); + } else { + SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy); + SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy); + } + } + } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) + break; + + Function *NewF = Intrinsic::getDeclaration( + II.getModule(), NewIID, {II.getType(), SrcLHS->getType()}); + Value *Args[] = {SrcLHS, SrcRHS, + ConstantInt::get(CC->getType(), SrcPred)}; + CallInst *NewCall = IC.Builder.CreateCall(NewF, Args); + NewCall->takeName(&II); + return IC.replaceInstUsesWith(II, NewCall); + } + + break; + } + case Intrinsic::amdgcn_ballot: { + if (auto *Src = dyn_cast(II.getArgOperand(0))) { + if (Src->isZero()) { + // amdgcn.ballot(i1 0) is zero. + return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType())); + } + + if (Src->isOne()) { + // amdgcn.ballot(i1 1) is exec. 
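+        // An i32 (wave32) ballot reads exec_lo; an i64 (wave64) ballot reads exec.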
+ const char *RegName = "exec"; + if (II.getType()->isIntegerTy(32)) + RegName = "exec_lo"; + else if (!II.getType()->isIntegerTy(64)) + break; + + Function *NewF = Intrinsic::getDeclaration( + II.getModule(), Intrinsic::read_register, II.getType()); + Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)}; + MDNode *MD = MDNode::get(II.getContext(), MDArgs); + Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)}; + CallInst *NewCall = IC.Builder.CreateCall(NewF, Args); + NewCall->addAttribute(AttributeList::FunctionIndex, + Attribute::Convergent); + NewCall->takeName(&II); + return IC.replaceInstUsesWith(II, NewCall); + } + } + break; + } + case Intrinsic::amdgcn_wqm_vote: { + // wqm_vote is identity when the argument is constant. + if (!isa(II.getArgOperand(0))) + break; + + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + case Intrinsic::amdgcn_kill: { + const ConstantInt *C = dyn_cast(II.getArgOperand(0)); + if (!C || !C->getZExtValue()) + break; + + // amdgcn.kill(i1 1) is a no-op + return IC.eraseInstFromFunction(II); + } + case Intrinsic::amdgcn_update_dpp: { + Value *Old = II.getArgOperand(0); + + auto *BC = cast(II.getArgOperand(5)); + auto *RM = cast(II.getArgOperand(3)); + auto *BM = cast(II.getArgOperand(4)); + if (BC->isZeroValue() || RM->getZExtValue() != 0xF || + BM->getZExtValue() != 0xF || isa(Old)) + break; + + // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value. + return IC.replaceOperand(II, 0, UndefValue::get(Old->getType())); + } + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: { + // Discard vdst_in if it's not going to be read. + Value *VDstIn = II.getArgOperand(0); + if (isa(VDstIn)) + break; + + ConstantInt *FetchInvalid = cast(II.getArgOperand(4)); + ConstantInt *BoundCtrl = cast(II.getArgOperand(5)); + if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue()) + break; + + return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType())); + } + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: { + // A constant value is trivially uniform. + if (Constant *C = dyn_cast(II.getArgOperand(0))) { + return IC.replaceInstUsesWith(II, C); + } + + // The rest of these may not be safe if the exec may not be the same between + // the def and use. + Value *Src = II.getArgOperand(0); + Instruction *SrcInst = dyn_cast(Src); + if (SrcInst && SrcInst->getParent() != II.getParent()) + break; + + // readfirstlane (readfirstlane x) -> readfirstlane x + // readlane (readfirstlane x), y -> readfirstlane x + if (match(Src, + PatternMatch::m_Intrinsic())) { + return IC.replaceInstUsesWith(II, Src); + } + + if (IID == Intrinsic::amdgcn_readfirstlane) { + // readfirstlane (readlane x, y) -> readlane x, y + if (match(Src, PatternMatch::m_Intrinsic())) { + return IC.replaceInstUsesWith(II, Src); + } + } else { + // readlane (readlane x, y), y -> readlane x, y + if (match(Src, PatternMatch::m_Intrinsic( + PatternMatch::m_Value(), + PatternMatch::m_Specific(II.getArgOperand(1))))) { + return IC.replaceInstUsesWith(II, Src); + } + } + + break; + } + case Intrinsic::amdgcn_ldexp: { + // FIXME: This doesn't introduce new instructions and belongs in + // InstructionSimplify. + Type *Ty = II.getType(); + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + + // Folding undef to qnan is safe regardless of the FP mode. 
+ if (isa(Op0)) { + auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); + return IC.replaceInstUsesWith(II, QNaN); + } + + const APFloat *C = nullptr; + match(Op0, PatternMatch::m_APFloat(C)); + + // FIXME: Should flush denorms depending on FP mode, but that's ignored + // everywhere else. + // + // These cases should be safe, even with strictfp. + // ldexp(0.0, x) -> 0.0 + // ldexp(-0.0, x) -> -0.0 + // ldexp(inf, x) -> inf + // ldexp(-inf, x) -> -inf + if (C && (C->isZero() || C->isInfinity())) { + return IC.replaceInstUsesWith(II, Op0); + } + + // With strictfp, be more careful about possibly needing to flush denormals + // or not, and snan behavior depends on ieee_mode. + if (II.isStrictFP()) + break; + + if (C && C->isNaN()) { + // FIXME: We just need to make the nan quiet here, but that's unavailable + // on APFloat, only IEEEfloat + auto *Quieted = + ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven)); + return IC.replaceInstUsesWith(II, Quieted); + } + + // ldexp(x, 0) -> x + // ldexp(x, undef) -> x + if (isa(Op1) || match(Op1, PatternMatch::m_ZeroInt())) { + return IC.replaceInstUsesWith(II, Op0); + } + + break; + } + } + return None; +} + +/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics. +/// +/// Note: This only supports non-TFE/LWE image intrinsic calls; those have +/// struct returns. +Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, + IntrinsicInst &II, + APInt DemandedElts, + int DMaskIdx = -1) { + + // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully + // supported. + if (DMaskIdx < 0 && II.getType()->getScalarSizeInBits() != 32 && + DemandedElts.getActiveBits() == 3) + return nullptr; + + auto *IIVTy = cast(II.getType()); + unsigned VWidth = IIVTy->getNumElements(); + if (VWidth == 1) + return nullptr; + + IRBuilderBase::InsertPointGuard Guard(IC.Builder); + IC.Builder.SetInsertPoint(&II); + + // Assume the arguments are unchanged and later override them, if needed. + SmallVector Args(II.arg_begin(), II.arg_end()); + + if (DMaskIdx < 0) { + // Buffer case. + + const unsigned ActiveBits = DemandedElts.getActiveBits(); + const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros(); + + // Start assuming the prefix of elements is demanded, but possibly clear + // some other bits if there are trailing zeros (unused components at front) + // and update offset. + DemandedElts = (1 << ActiveBits) - 1; + + if (UnusedComponentsAtFront > 0) { + static const unsigned InvalidOffsetIdx = 0xf; + + unsigned OffsetIdx; + switch (II.getIntrinsicID()) { + case Intrinsic::amdgcn_raw_buffer_load: + OffsetIdx = 1; + break; + case Intrinsic::amdgcn_s_buffer_load: + // If resulting type is vec3, there is no point in trimming the + // load with updated offset, as the vec3 would most likely be widened to + // vec4 anyway during lowering. + if (ActiveBits == 4 && UnusedComponentsAtFront == 1) + OffsetIdx = InvalidOffsetIdx; + else + OffsetIdx = 1; + break; + case Intrinsic::amdgcn_struct_buffer_load: + OffsetIdx = 2; + break; + default: + // TODO: handle tbuffer* intrinsics. + OffsetIdx = InvalidOffsetIdx; + break; + } + + if (OffsetIdx != InvalidOffsetIdx) { + // Clear demanded bits and update the offset. 
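+        // The skipped leading components are folded into the buffer offset
+        // instead of being loaded and then discarded.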
+ DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1); + auto *Offset = II.getArgOperand(OffsetIdx); + unsigned SingleComponentSizeInBits = + IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType()); + unsigned OffsetAdd = + UnusedComponentsAtFront * SingleComponentSizeInBits / 8; + auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd); + Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal); + } + } + } else { + // Image case. + + ConstantInt *DMask = cast(II.getArgOperand(DMaskIdx)); + unsigned DMaskVal = DMask->getZExtValue() & 0xf; + + // Mask off values that are undefined because the dmask doesn't cover them + DemandedElts &= (1 << countPopulation(DMaskVal)) - 1; + + unsigned NewDMaskVal = 0; + unsigned OrigLoadIdx = 0; + for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) { + const unsigned Bit = 1 << SrcIdx; + if (!!(DMaskVal & Bit)) { + if (!!DemandedElts[OrigLoadIdx]) + NewDMaskVal |= Bit; + OrigLoadIdx++; + } + } + + if (DMaskVal != NewDMaskVal) + Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal); + } + + unsigned NewNumElts = DemandedElts.countPopulation(); + if (!NewNumElts) + return UndefValue::get(II.getType()); + + if (NewNumElts >= VWidth && DemandedElts.isMask()) { + if (DMaskIdx >= 0) + II.setArgOperand(DMaskIdx, Args[DMaskIdx]); + return nullptr; + } + + // Validate function argument and return types, extracting overloaded types + // along the way. + SmallVector OverloadTys; + if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys)) + return nullptr; + + Module *M = II.getParent()->getParent()->getParent(); + Type *EltTy = IIVTy->getElementType(); + Type *NewTy = + (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts); + + OverloadTys[0] = NewTy; + Function *NewIntrin = + Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys); + + CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args); + NewCall->takeName(&II); + NewCall->copyMetadata(II); + + if (NewNumElts == 1) { + return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()), + NewCall, + DemandedElts.countTrailingZeros()); + } + + SmallVector EltMask; + unsigned NewLoadIdx = 0; + for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) { + if (!!DemandedElts[OrigLoadIdx]) + EltMask.push_back(NewLoadIdx++); + else + EltMask.push_back(NewNumElts); + } + + Value *Shuffle = + IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask); + + return Shuffle; +} + +Optional GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const { + switch (II.getIntrinsicID()) { + case Intrinsic::amdgcn_buffer_load: + case Intrinsic::amdgcn_buffer_load_format: + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_s_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load_format: + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_tbuffer_load: + return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts); + default: { + if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) { + return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0); + } + break; + } + } + return None; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h 
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -33,6 +33,7 @@ namespace llvm { class AMDGPUTargetLowering; +class InstCombiner; class Loop; class ScalarEvolution; class Type; @@ -223,6 +224,14 @@ Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const; + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const; + unsigned getVectorSplitCost() { return 0; } unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -34,6 +34,10 @@ add_public_tablegen_target(AMDGPUCommonTableGen) +set(LLVM_TARGET_DEFINITIONS InstCombineTables.td) +tablegen(LLVM InstCombineTables.inc -gen-searchable-tables) +add_public_tablegen_target(InstCombineTableGen) + add_llvm_target(AMDGPUCodeGen AMDGPUAliasAnalysis.cpp AMDGPUAlwaysInlinePass.cpp @@ -48,6 +52,7 @@ AMDGPUFixFunctionBitcasts.cpp AMDGPUFrameLowering.cpp AMDGPUHSAMetadataStreamer.cpp + AMDGPUInstCombineIntrinsic.cpp AMDGPUInstrInfo.cpp AMDGPUInstructionSelector.cpp AMDGPUISelDAGToDAG.cpp diff --git a/llvm/lib/Transforms/InstCombine/InstCombineTables.td b/llvm/lib/Target/AMDGPU/InstCombineTables.td rename from llvm/lib/Transforms/InstCombine/InstCombineTables.td rename to llvm/lib/Target/AMDGPU/InstCombineTables.td diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -113,6 +113,9 @@ return !ST->isTargetDarwin() && !ST->hasMVEFloatOps(); } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -28,6 +28,8 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include #include @@ -50,6 +52,28 @@ extern cl::opt EnableMaskedGatherScatters; +/// Convert a vector load intrinsic into a simple llvm load instruction. +/// This is beneficial when the underlying object being addressed comes +/// from a constant, since we get constant-folding for free. +static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, + InstCombiner::BuilderTy &Builder) { + auto *IntrAlign = dyn_cast(II.getArgOperand(1)); + + if (!IntrAlign) + return nullptr; + + unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign + ? 
MemAlign + : IntrAlign->getLimitedValue(); + + if (!isPowerOf2_32(Alignment)) + return nullptr; + + auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), + PointerType::get(II.getType(), 0)); + return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); +} + bool ARMTTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); @@ -82,6 +106,114 @@ return false; } +Optional +ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::arm_neon_vld1: { + Align MemAlign = + getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()); + if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + } + + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + Align MemAlign = + getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()); + unsigned AlignArg = II.getNumArgOperands() - 1; + Value *AlignArgOp = II.getArgOperand(AlignArg); + MaybeAlign Align = cast(AlignArgOp)->getMaybeAlignValue(); + if (Align && *Align < MemAlign) { + return IC.replaceOperand( + II, AlignArg, + ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(), + false)); + } + break; + } + + case Intrinsic::arm_mve_pred_i2v: { + Value *Arg = II.getArgOperand(0); + Value *ArgArg; + if (match(Arg, PatternMatch::m_Intrinsic( + PatternMatch::m_Value(ArgArg))) && + II.getType() == ArgArg->getType()) { + return IC.replaceInstUsesWith(II, ArgArg); + } + Constant *XorMask; + if (match(Arg, m_Xor(PatternMatch::m_Intrinsic( + PatternMatch::m_Value(ArgArg)), + PatternMatch::m_Constant(XorMask))) && + II.getType() == ArgArg->getType()) { + if (auto *CI = dyn_cast(XorMask)) { + if (CI->getValue().trunc(16).isAllOnesValue()) { + auto TrueVector = IC.Builder.CreateVectorSplat( + cast(II.getType())->getNumElements(), + IC.Builder.getTrue()); + return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); + } + } + } + KnownBits ScalarKnown(32); + if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16), + ScalarKnown, 0)) { + return &II; + } + break; + } + case Intrinsic::arm_mve_pred_v2i: { + Value *Arg = II.getArgOperand(0); + Value *ArgArg; + if (match(Arg, PatternMatch::m_Intrinsic( + PatternMatch::m_Value(ArgArg)))) { + return IC.replaceInstUsesWith(II, ArgArg); + } + if (!II.getMetadata(LLVMContext::MD_range)) { + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + Metadata *M[] = { + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))}; + II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M)); + return &II; + } + break; + } + case Intrinsic::arm_mve_vadc: + case Intrinsic::arm_mve_vadc_predicated: { + unsigned CarryOp = + (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 
3 : 2; + assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && + "Bad type for intrinsic!"); + + KnownBits CarryKnown(32); + if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29), + CarryKnown)) { + return &II; + } + break; + } + } + return None; +} + int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -48,6 +48,9 @@ return AddressSpace::ADDRESS_SPACE_GENERIC; } + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + // Loads and stores can be vectorized if the alignment is at least as big as // the load/store we want to vectorize. bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -111,6 +111,263 @@ return false; } +// Convert NVVM intrinsics to target-generic LLVM code where possible. +static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { + // Each NVVM intrinsic we can simplify can be replaced with one of: + // + // * an LLVM intrinsic, + // * an LLVM cast operation, + // * an LLVM binary operation, or + // * ad-hoc LLVM IR for the particular operation. + + // Some transformations are only valid when the module's + // flush-denormals-to-zero (ftz) setting is true/false, whereas other + // transformations are valid regardless of the module's ftz setting. + enum FtzRequirementTy { + FTZ_Any, // Any ftz setting is ok. + FTZ_MustBeOn, // Transformation is valid only if ftz is on. + FTZ_MustBeOff, // Transformation is valid only if ftz is off. + }; + // Classes of NVVM intrinsics that can't be replaced one-to-one with a + // target-generic intrinsic, cast op, or binary op but that we can nonetheless + // simplify. + enum SpecialCase { + SPC_Reciprocal, + }; + + // SimplifyAction is a poor-man's variant (plus an additional flag) that + // represents how to replace an NVVM intrinsic with target-generic LLVM IR. + struct SimplifyAction { + // Invariant: At most one of these Optionals has a value. + Optional IID; + Optional CastOp; + Optional BinaryOp; + Optional Special; + + FtzRequirementTy FtzRequirement = FTZ_Any; + + SimplifyAction() = default; + + SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) + : IID(IID), FtzRequirement(FtzReq) {} + + // Cast operations don't have anything to do with FTZ, so we skip that + // argument. + SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} + + SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) + : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} + + SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) + : Special(Special), FtzRequirement(FtzReq) {} + }; + + // Try to generate a SimplifyAction describing how to replace our + // IntrinsicInstr with target-generic LLVM IR. + const SimplifyAction Action = [II]() -> SimplifyAction { + switch (II->getIntrinsicID()) { + // NVVM intrinsics that map directly to LLVM intrinsics. 
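+ // For example, llvm.nvvm.ceil.d(double %x) is rewritten below to the
+ // target-generic llvm.ceil.f64(double %x).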
+ case Intrinsic::nvvm_ceil_d: + return {Intrinsic::ceil, FTZ_Any}; + case Intrinsic::nvvm_ceil_f: + return {Intrinsic::ceil, FTZ_MustBeOff}; + case Intrinsic::nvvm_ceil_ftz_f: + return {Intrinsic::ceil, FTZ_MustBeOn}; + case Intrinsic::nvvm_fabs_d: + return {Intrinsic::fabs, FTZ_Any}; + case Intrinsic::nvvm_fabs_f: + return {Intrinsic::fabs, FTZ_MustBeOff}; + case Intrinsic::nvvm_fabs_ftz_f: + return {Intrinsic::fabs, FTZ_MustBeOn}; + case Intrinsic::nvvm_floor_d: + return {Intrinsic::floor, FTZ_Any}; + case Intrinsic::nvvm_floor_f: + return {Intrinsic::floor, FTZ_MustBeOff}; + case Intrinsic::nvvm_floor_ftz_f: + return {Intrinsic::floor, FTZ_MustBeOn}; + case Intrinsic::nvvm_fma_rn_d: + return {Intrinsic::fma, FTZ_Any}; + case Intrinsic::nvvm_fma_rn_f: + return {Intrinsic::fma, FTZ_MustBeOff}; + case Intrinsic::nvvm_fma_rn_ftz_f: + return {Intrinsic::fma, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmax_d: + return {Intrinsic::maxnum, FTZ_Any}; + case Intrinsic::nvvm_fmax_f: + return {Intrinsic::maxnum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmax_ftz_f: + return {Intrinsic::maxnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_fmin_d: + return {Intrinsic::minnum, FTZ_Any}; + case Intrinsic::nvvm_fmin_f: + return {Intrinsic::minnum, FTZ_MustBeOff}; + case Intrinsic::nvvm_fmin_ftz_f: + return {Intrinsic::minnum, FTZ_MustBeOn}; + case Intrinsic::nvvm_round_d: + return {Intrinsic::round, FTZ_Any}; + case Intrinsic::nvvm_round_f: + return {Intrinsic::round, FTZ_MustBeOff}; + case Intrinsic::nvvm_round_ftz_f: + return {Intrinsic::round, FTZ_MustBeOn}; + case Intrinsic::nvvm_sqrt_rn_d: + return {Intrinsic::sqrt, FTZ_Any}; + case Intrinsic::nvvm_sqrt_f: + // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the + // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts + // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are + // the versions with explicit ftz-ness. + return {Intrinsic::sqrt, FTZ_Any}; + case Intrinsic::nvvm_sqrt_rn_f: + return {Intrinsic::sqrt, FTZ_MustBeOff}; + case Intrinsic::nvvm_sqrt_rn_ftz_f: + return {Intrinsic::sqrt, FTZ_MustBeOn}; + case Intrinsic::nvvm_trunc_d: + return {Intrinsic::trunc, FTZ_Any}; + case Intrinsic::nvvm_trunc_f: + return {Intrinsic::trunc, FTZ_MustBeOff}; + case Intrinsic::nvvm_trunc_ftz_f: + return {Intrinsic::trunc, FTZ_MustBeOn}; + + // NVVM intrinsics that map to LLVM cast operations. + // + // Note that llvm's target-generic conversion operators correspond to the rz + // (round to zero) versions of the nvvm conversion intrinsics, even though + // most everything else here uses the rn (round to nearest even) nvvm ops. + case Intrinsic::nvvm_d2i_rz: + case Intrinsic::nvvm_f2i_rz: + case Intrinsic::nvvm_d2ll_rz: + case Intrinsic::nvvm_f2ll_rz: + return {Instruction::FPToSI}; + case Intrinsic::nvvm_d2ui_rz: + case Intrinsic::nvvm_f2ui_rz: + case Intrinsic::nvvm_d2ull_rz: + case Intrinsic::nvvm_f2ull_rz: + return {Instruction::FPToUI}; + case Intrinsic::nvvm_i2d_rz: + case Intrinsic::nvvm_i2f_rz: + case Intrinsic::nvvm_ll2d_rz: + case Intrinsic::nvvm_ll2f_rz: + return {Instruction::SIToFP}; + case Intrinsic::nvvm_ui2d_rz: + case Intrinsic::nvvm_ui2f_rz: + case Intrinsic::nvvm_ull2d_rz: + case Intrinsic::nvvm_ull2f_rz: + return {Instruction::UIToFP}; + + // NVVM intrinsics that map to LLVM binary ops. 
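+ // For example, llvm.nvvm.add.rn.d(%a, %b) becomes a plain 'fadd double %a, %b';
+ // round-to-nearest-even is already the default IR rounding behaviour.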
+ case Intrinsic::nvvm_add_rn_d: + return {Instruction::FAdd, FTZ_Any}; + case Intrinsic::nvvm_add_rn_f: + return {Instruction::FAdd, FTZ_MustBeOff}; + case Intrinsic::nvvm_add_rn_ftz_f: + return {Instruction::FAdd, FTZ_MustBeOn}; + case Intrinsic::nvvm_mul_rn_d: + return {Instruction::FMul, FTZ_Any}; + case Intrinsic::nvvm_mul_rn_f: + return {Instruction::FMul, FTZ_MustBeOff}; + case Intrinsic::nvvm_mul_rn_ftz_f: + return {Instruction::FMul, FTZ_MustBeOn}; + case Intrinsic::nvvm_div_rn_d: + return {Instruction::FDiv, FTZ_Any}; + case Intrinsic::nvvm_div_rn_f: + return {Instruction::FDiv, FTZ_MustBeOff}; + case Intrinsic::nvvm_div_rn_ftz_f: + return {Instruction::FDiv, FTZ_MustBeOn}; + + // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but + // need special handling. + // + // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just + // as well. + case Intrinsic::nvvm_rcp_rn_d: + return {SPC_Reciprocal, FTZ_Any}; + case Intrinsic::nvvm_rcp_rn_f: + return {SPC_Reciprocal, FTZ_MustBeOff}; + case Intrinsic::nvvm_rcp_rn_ftz_f: + return {SPC_Reciprocal, FTZ_MustBeOn}; + + // We do not currently simplify intrinsics that give an approximate + // answer. These include: + // + // - nvvm_cos_approx_{f,ftz_f} + // - nvvm_ex2_approx_{d,f,ftz_f} + // - nvvm_lg2_approx_{d,f,ftz_f} + // - nvvm_sin_approx_{f,ftz_f} + // - nvvm_sqrt_approx_{f,ftz_f} + // - nvvm_rsqrt_approx_{d,f,ftz_f} + // - nvvm_div_approx_{ftz_d,ftz_f,f} + // - nvvm_rcp_approx_ftz_d + // + // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" + // means that fastmath is enabled in the intrinsic. Unfortunately only + // binary operators (currently) have a fastmath bit in SelectionDAG, so + // this information gets lost and we can't select on it. + // + // TODO: div and rcp are lowered to a binary op, so these we could in + // theory lower them to "fast fdiv". + + default: + return {}; + } + }(); + + // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we + // can bail out now. (Notice that in the case that IID is not an NVVM + // intrinsic, we don't have to look up any module metadata, as + // FtzRequirementTy will be FTZ_Any.) + if (Action.FtzRequirement != FTZ_Any) { + StringRef Attr = II->getFunction() + ->getFnAttribute("denormal-fp-math-f32") + .getValueAsString(); + DenormalMode Mode = parseDenormalFPAttribute(Attr); + bool FtzEnabled = Mode.Output != DenormalMode::IEEE; + + if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) + return nullptr; + } + + // Simplify to target-generic intrinsic. + if (Action.IID) { + SmallVector Args(II->arg_operands()); + // All the target-generic intrinsics currently of interest to us have one + // type argument, equal to that of the nvvm intrinsic's argument. + Type *Tys[] = {II->getArgOperand(0)->getType()}; + return CallInst::Create( + Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); + } + + // Simplify to target-generic binary op. + if (Action.BinaryOp) + return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), + II->getArgOperand(1), II->getName()); + + // Simplify to target-generic cast op. + if (Action.CastOp) + return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), + II->getName()); + + // All that's left are the special cases. + if (!Action.Special) + return nullptr; + + switch (*Action.Special) { + case SPC_Reciprocal: + // Simplify reciprocal. 
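+ // The reciprocal is emitted as an exact IR division, e.g.
+ //   %r = fdiv double 1.000000e+00, %x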
+ return BinaryOperator::Create( + Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), + II->getArgOperand(0), II->getName()); + } + llvm_unreachable("All SpecialCase enumerators should be handled in switch."); +} + +Optional +NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) { + return I; + } + return None; +} + int NVPTXTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -41,6 +41,9 @@ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -13,8 +13,11 @@ #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "ppctti" @@ -59,6 +62,158 @@ return TTI::PSK_Software; } +Optional +PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::ppc_altivec_lvx: + case Intrinsic::ppc_altivec_lvxl: + // Turn PPC lvx -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(0), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Value *Ptr = IC.Builder.CreateBitCast( + II.getArgOperand(0), PointerType::getUnqual(II.getType())); + return new LoadInst(II.getType(), Ptr, "", false, Align(16)); + } + break; + case Intrinsic::ppc_vsx_lxvw4x: + case Intrinsic::ppc_vsx_lxvd2x: { + // Turn PPC VSX loads into normal loads. + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0), + PointerType::getUnqual(II.getType())); + return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1)); + } + case Intrinsic::ppc_altivec_stvx: + case Intrinsic::ppc_altivec_stvxl: + // Turn stvx -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(1), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16)); + } + break; + case Intrinsic::ppc_vsx_stxvw4x: + case Intrinsic::ppc_vsx_stxvd2x: { + // Turn PPC VSX stores into normal stores. 
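+ // Unlike the Altivec cases above, VSX accesses have no alignment
+ // requirement, so no alignment check is needed and align 1 is used.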
+ Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1)); + } + case Intrinsic::ppc_qpx_qvlfs: + // Turn PPC QPX qvlfs -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(0), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Type *VTy = + VectorType::get(IC.Builder.getFloatTy(), + cast(II.getType())->getElementCount()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0), + PointerType::getUnqual(VTy)); + Value *Load = IC.Builder.CreateLoad(VTy, Ptr); + return new FPExtInst(Load, II.getType()); + } + break; + case Intrinsic::ppc_qpx_qvlfd: + // Turn PPC QPX qvlfd -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(0), Align(32), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { + Value *Ptr = IC.Builder.CreateBitCast( + II.getArgOperand(0), PointerType::getUnqual(II.getType())); + return new LoadInst(II.getType(), Ptr, "", false, Align(32)); + } + break; + case Intrinsic::ppc_qpx_qvstfs: + // Turn PPC QPX qvstfs -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(1), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Type *VTy = VectorType::get( + IC.Builder.getFloatTy(), + cast(II.getArgOperand(0)->getType())->getElementCount()); + Value *TOp = IC.Builder.CreateFPTrunc(II.getArgOperand(0), VTy); + Type *OpPtrTy = PointerType::getUnqual(VTy); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(TOp, Ptr, false, Align(16)); + } + break; + case Intrinsic::ppc_qpx_qvstfd: + // Turn PPC QPX qvstfd -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(1), Align(32), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) { + Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(II.getArgOperand(0), Ptr, false, Align(32)); + } + break; + + case Intrinsic::ppc_altivec_vperm: + // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. + // Note that ppc_altivec_vperm has a big-endian bias, so when creating + // a vectorshuffle for little endian, we must undo the transformation + // performed on vec_perm in altivec.h. That is, we must complement + // the permutation mask with respect to 31 and reverse the order of + // V1 and V2. + if (Constant *Mask = dyn_cast(II.getArgOperand(2))) { + assert(cast(Mask->getType())->getNumElements() == 16 && + "Bad type for intrinsic!"); + + // Check that all of the elements are integer constants or undefs. + bool AllEltsOk = true; + for (unsigned i = 0; i != 16; ++i) { + Constant *Elt = Mask->getAggregateElement(i); + if (!Elt || !(isa(Elt) || isa(Elt))) { + AllEltsOk = false; + break; + } + } + + if (AllEltsOk) { + // Cast the input vectors to byte vectors. + Value *Op0 = + IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType()); + Value *Op1 = + IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType()); + Value *Result = UndefValue::get(Op0->getType()); + + // Only extract each element once. 
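+ // ExtractedElts caches the lanes already pulled out of the two sources, so a
+ // byte selected several times by the mask reuses a single extractelement.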
+ Value *ExtractedElts[32]; + memset(ExtractedElts, 0, sizeof(ExtractedElts)); + + for (unsigned i = 0; i != 16; ++i) { + if (isa(Mask->getAggregateElement(i))) + continue; + unsigned Idx = + cast(Mask->getAggregateElement(i))->getZExtValue(); + Idx &= 31; // Match the hardware behavior. + if (DL.isLittleEndian()) + Idx = 31 - Idx; + + if (!ExtractedElts[Idx]) { + Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; + Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; + ExtractedElts[Idx] = IC.Builder.CreateExtractElement( + Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15)); + } + + // Insert this value into the result vector. + Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx], + IC.Builder.getInt32(i)); + } + return CastInst::Create(Instruction::BitCast, Result, II.getType()); + } + } + break; + } + return None; +} + int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { if (DisablePPCConstHoist) @@ -849,7 +1004,7 @@ // The cost of the load constant for a vector extract is disregarded // (invariant, easily schedulable). return vectorCostAdjustment(1, Opcode, Val, nullptr); - + } else if (ST->hasDirectMove()) // Assume permute has standard cost. // Assume move-to/move-from VSR have 2x standard cost. diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -47,6 +47,7 @@ X86IndirectThunks.cpp X86InterleavedAccess.cpp X86InsertPrefetch.cpp + X86InstCombineIntrinsic.cpp X86InstrFMA3Info.cpp X86InstrFoldTables.cpp X86InstrInfo.cpp diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -0,0 +1,2007 @@ +//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// X86 target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "X86TargetTransformInfo.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86tti" + +/// Return a constant boolean vector that has true elements in all positions +/// where the input constant data vector has an element with the sign bit set. +static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { + SmallVector BoolVec; + IntegerType *BoolTy = Type::getInt1Ty(V->getContext()); + for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) { + Constant *Elt = V->getElementAsConstant(I); + assert((isa(Elt) || isa(Elt)) && + "Unexpected constant data vector element type"); + bool Sign = V->getElementType()->isIntegerTy() + ? 
cast(Elt)->isNegative() + : cast(Elt)->isNegative(); + BoolVec.push_back(ConstantInt::get(BoolTy, Sign)); + } + return ConstantVector::get(BoolVec); +} + +// TODO: If the x86 backend knew how to convert a bool vector mask back to an +// XMM register mask efficiently, we could transform all x86 masked intrinsics +// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. +static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { + Value *Ptr = II.getOperand(0); + Value *Mask = II.getOperand(1); + Constant *ZeroVec = Constant::getNullValue(II.getType()); + + // Special case a zero mask since that's not a ConstantDataVector. + // This masked load instruction creates a zero vector. + if (isa(Mask)) + return IC.replaceInstUsesWith(II, ZeroVec); + + auto *ConstMask = dyn_cast(Mask); + if (!ConstMask) + return nullptr; + + // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic + // to allow target-independent optimizations. + + // First, cast the x86 intrinsic scalar pointer to a vector pointer to match + // the LLVM intrinsic definition for the pointer argument. + unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); + PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); + Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); + + // Second, convert the x86 XMM integer vector mask to a vector of bools based + // on each element's most significant bit (the sign bit). + Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); + + // The pass-through vector for an x86 masked load is a zero vector. + CallInst *NewMaskedLoad = + IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec); + return IC.replaceInstUsesWith(II, NewMaskedLoad); +} + +// TODO: If the x86 backend knew how to convert a bool vector mask back to an +// XMM register mask efficiently, we could transform all x86 masked intrinsics +// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. +static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { + Value *Ptr = II.getOperand(0); + Value *Mask = II.getOperand(1); + Value *Vec = II.getOperand(2); + + // Special case a zero mask since that's not a ConstantDataVector: + // this masked store instruction does nothing. + if (isa(Mask)) { + IC.eraseInstFromFunction(II); + return true; + } + + // The SSE2 version is too weird (eg, unaligned but non-temporal) to do + // anything else at this level. + if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) + return false; + + auto *ConstMask = dyn_cast(Mask); + if (!ConstMask) + return false; + + // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic + // to allow target-independent optimizations. + + // First, cast the x86 intrinsic scalar pointer to a vector pointer to match + // the LLVM intrinsic definition for the pointer argument. + unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); + PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); + Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); + + // Second, convert the x86 XMM integer vector mask to a vector of bools based + // on each element's most significant bit (the sign bit). + Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); + + IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); + + // 'Replace uses' doesn't work for stores. Erase the original masked store. 
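+ // The llvm.masked.store created above already carries the converted mask and
+ // pointer, so dropping the x86 intrinsic completes the rewrite.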
+ IC.eraseInstFromFunction(II); + return true; +} + +static Value *simplifyX86immShift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + bool IsImm = false; + + switch (II.getIntrinsicID()) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + LogicalShift = false; + ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + LogicalShift = true; + ShiftLeft = false; + break; + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: + LogicalShift = true; + ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + auto Vec = II.getArgOperand(0); + auto Amt = II.getArgOperand(1); + auto VT = cast(Vec->getType()); + auto SVT = VT->getElementType(); + auto AmtVT = Amt->getType(); + unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If the shift amount is guaranteed to be in-range we can replace it with a + // generic shift. If its guaranteed to be out of range, logical shifts combine + // to zero and arithmetic shifts are clamped to (BitWidth - 1). 
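+ // E.g. psrli.d by a constant 3 becomes an lshr by a splat of 3, while a
+ // constant shift amount of 37 folds the whole call to zeroinitializer.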
+ if (IsImm) { + assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); + KnownBits KnownAmtBits = + llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); + if (KnownAmtBits.getMaxValue().ult(BitWidth)) { + Amt = Builder.CreateZExtOrTrunc(Amt, SVT); + Amt = Builder.CreateVectorSplat(VWidth, Amt); + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + if (KnownAmtBits.getMinValue().uge(BitWidth)) { + if (LogicalShift) + return ConstantAggregateZero::get(VT); + Amt = ConstantInt::get(SVT, BitWidth - 1); + return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); + } + } else { + // Ensure the first element has an in-range value and the rest of the + // elements in the bottom 64 bits are zero. + assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && + cast(AmtVT)->getElementType() == SVT && + "Unexpected shift-by-scalar type"); + unsigned NumAmtElts = cast(AmtVT)->getNumElements(); + APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); + APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); + KnownBits KnownLowerBits = llvm::computeKnownBits( + Amt, DemandedLower, II.getModule()->getDataLayout()); + KnownBits KnownUpperBits = llvm::computeKnownBits( + Amt, DemandedUpper, II.getModule()->getDataLayout()); + if (KnownLowerBits.getMaxValue().ult(BitWidth) && + (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) { + SmallVector ZeroSplat(VWidth, 0); + Amt = Builder.CreateShuffleVector(Amt, Amt, ZeroSplat); + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + } + + // Simplify if count is constant vector. + auto CDV = dyn_cast(Amt); + if (!CDV) + return nullptr; + + // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && + cast(AmtVT)->getElementType() == SVT && + "Unexpected shift-by-scalar type"); + + // Concatenate the sub-elements to create the 64-bit value. + APInt Count(64, 0); + for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast(CDV->getElementAsConstant(SubEltIdx)); + Count <<= BitWidth; + Count |= SubElt->getValue().zextOrTrunc(64); + } + + // If shift-by-zero then just return the original value. + if (Count.isNullValue()) + return Vec; + + // Handle cases when Shift >= BitWidth. + if (Count.uge(BitWidth)) { + // If LogicalShift - just return zero. + if (LogicalShift) + return ConstantAggregateZero::get(VT); + + // If ArithmeticShift - clamp Shift to (BitWidth - 1). + Count = APInt(64, BitWidth - 1); + } + + // Get a constant vector of the same type as the first operand. + auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); + auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. +// Unlike the generic IR shifts, the intrinsics have defined behaviour for out +// of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 
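+ // E.g. x86_avx2_psrlv_d(<4 x i32> %v, <4 x i32> %a) where every lane of %a is
+ // known to be below 32 becomes 'lshr <4 x i32> %v, %a'.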
+static Value *simplifyX86varShift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + + switch (II.getIntrinsicID()) { + default: + llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: + LogicalShift = false; + ShiftLeft = false; + break; + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: + LogicalShift = true; + ShiftLeft = false; + break; + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: + LogicalShift = true; + ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + auto Vec = II.getArgOperand(0); + auto Amt = II.getArgOperand(1); + auto VT = cast(II.getType()); + auto SVT = VT->getElementType(); + int NumElts = VT->getNumElements(); + int BitWidth = SVT->getIntegerBitWidth(); + + // If the shift amount is guaranteed to be in-range we can replace it with a + // generic shift. + APInt UpperBits = + APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); + if (llvm::MaskedValueIsZero(Amt, UpperBits, + II.getModule()->getDataLayout())) { + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + + // Simplify if all shift amounts are constant/undef. + auto *CShift = dyn_cast(Amt); + if (!CShift) + return nullptr; + + // Collect each element's shift amount. + // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. + bool AnyOutOfRange = false; + SmallVector ShiftAmts; + for (int I = 0; I < NumElts; ++I) { + auto *CElt = CShift->getAggregateElement(I); + if (CElt && isa(CElt)) { + ShiftAmts.push_back(-1); + continue; + } + + auto *COp = dyn_cast_or_null(CElt); + if (!COp) + return nullptr; + + // Handle out of range shifts. + // If LogicalShift - set to BitWidth (special case). + // If ArithmeticShift - set to (BitWidth - 1) (sign splat). + APInt ShiftVal = COp->getValue(); + if (ShiftVal.uge(BitWidth)) { + AnyOutOfRange = LogicalShift; + ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); + continue; + } + + ShiftAmts.push_back((int)ShiftVal.getZExtValue()); + } + + // If all elements out of range or UNDEF, return vector of zeros/undefs. + // ArithmeticShift should only hit this if they are all UNDEF. 
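+ // (Undef lanes were recorded as -1 and out-of-range logical lanes as BitWidth
+ // above, so the lambda below catches both.)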
+ auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; + if (llvm::all_of(ShiftAmts, OutOfRange)) { + SmallVector ConstantVec; + for (int Idx : ShiftAmts) { + if (Idx < 0) { + ConstantVec.push_back(UndefValue::get(SVT)); + } else { + assert(LogicalShift && "Logical shift expected"); + ConstantVec.push_back(ConstantInt::getNullValue(SVT)); + } + } + return ConstantVector::get(ConstantVec); + } + + // We can't handle only some out of range values with generic logical shifts. + if (AnyOutOfRange) + return nullptr; + + // Build the shift amount constant vector. + SmallVector ShiftVecAmts; + for (int Idx : ShiftAmts) { + if (Idx < 0) + ShiftVecAmts.push_back(UndefValue::get(SVT)); + else + ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); + } + auto ShiftVec = ConstantVector::get(ShiftVecAmts); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +static Value *simplifyX86pack(IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, bool IsSigned) { + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + Type *ResTy = II.getType(); + + // Fast all undef handling. + if (isa(Arg0) && isa(Arg1)) + return UndefValue::get(ResTy); + + auto *ArgTy = cast(Arg0->getType()); + unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; + unsigned NumSrcElts = ArgTy->getNumElements(); + assert(cast(ResTy)->getNumElements() == (2 * NumSrcElts) && + "Unexpected packing types"); + + unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; + unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); + unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); + assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && + "Unexpected packing types"); + + // Constant folding. + if (!isa(Arg0) || !isa(Arg1)) + return nullptr; + + // Clamp Values - signed/unsigned both use signed clamp values, but they + // differ on the min/max values. + APInt MinValue, MaxValue; + if (IsSigned) { + // PACKSS: Truncate signed value with signed saturation. + // Source values less than dst minint are saturated to minint. + // Source values greater than dst maxint are saturated to maxint. + MinValue = + APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); + MaxValue = + APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); + } else { + // PACKUS: Truncate signed value with unsigned saturation. + // Source values less than zero are saturated to zero. + // Source values greater than dst maxuint are saturated to maxuint. + MinValue = APInt::getNullValue(SrcScalarSizeInBits); + MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); + } + + auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); + auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); + Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); + Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); + Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); + Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); + + // Shuffle clamped args together at the lane level. 
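+ // E.g. for the 128-bit i16->i8 pack the mask is simply <0..7, 8..15> (all of
+ // Arg0 then all of Arg1); the 256-bit form keeps each 128-bit lane separate:
+ // <0..7, 16..23, 8..15, 24..31>.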
+ SmallVector PackMask; + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) + PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); + for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) + PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); + } + auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); + + // Truncate to dst size. + return Builder.CreateTrunc(Shuffle, ResTy); +} + +static Value *simplifyX86movmsk(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Value *Arg = II.getArgOperand(0); + Type *ResTy = II.getType(); + + // movmsk(undef) -> zero as we must ensure the upper bits are zero. + if (isa(Arg)) + return Constant::getNullValue(ResTy); + + auto *ArgTy = dyn_cast(Arg->getType()); + // We can't easily peek through x86_mmx types. + if (!ArgTy) + return nullptr; + + // Expand MOVMSK to compare/bitcast/zext: + // e.g. PMOVMSKB(v16i8 x): + // %cmp = icmp slt <16 x i8> %x, zeroinitializer + // %int = bitcast <16 x i1> %cmp to i16 + // %res = zext i16 %int to i32 + unsigned NumElts = ArgTy->getNumElements(); + Type *IntegerVecTy = VectorType::getInteger(ArgTy); + Type *IntegerTy = Builder.getIntNTy(NumElts); + + Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); + Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); + Res = Builder.CreateBitCast(Res, IntegerTy); + Res = Builder.CreateZExtOrTrunc(Res, ResTy); + return Res; +} + +static Value *simplifyX86addcarry(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Value *CarryIn = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + Value *Op2 = II.getArgOperand(2); + Type *RetTy = II.getType(); + Type *OpTy = Op1->getType(); + assert(RetTy->getStructElementType(0)->isIntegerTy(8) && + RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && + "Unexpected types for x86 addcarry"); + + // If carry-in is zero, this is just an unsigned add with overflow. + if (match(CarryIn, PatternMatch::m_ZeroInt())) { + Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, + {Op1, Op2}); + // The types have to be adjusted to match the x86 call types. + Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); + Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), + Builder.getInt8Ty()); + Value *Res = UndefValue::get(RetTy); + Res = Builder.CreateInsertValue(Res, UAddOV, 0); + return Builder.CreateInsertValue(Res, UAddResult, 1); + } + + return nullptr; +} + +static Value *simplifyX86insertps(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + auto *CInt = dyn_cast(II.getArgOperand(2)); + if (!CInt) + return nullptr; + + VectorType *VecTy = cast(II.getType()); + assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); + + // The immediate permute control byte looks like this: + // [3:0] - zero mask for each 32-bit lane + // [5:4] - select one 32-bit destination lane + // [7:6] - select one 32-bit source lane + + uint8_t Imm = CInt->getZExtValue(); + uint8_t ZMask = Imm & 0xf; + uint8_t DestLane = (Imm >> 4) & 0x3; + uint8_t SourceLane = (Imm >> 6) & 0x3; + + ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); + + // If all zero mask bits are set, this was just a weird way to + // generate a zero vector. + if (ZMask == 0xf) + return ZeroVector; + + // Initialize by passing all of the first source bits through. + int ShuffleMask[4] = {0, 1, 2, 3}; + + // We may replace the second operand with the zero vector. 
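+ // (That happens below when both sources are the same value or the zero mask
+ // also covers the destination lane.)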
+ Value *V1 = II.getArgOperand(1); + + if (ZMask) { + // If the zero mask is being used with a single input or the zero mask + // overrides the destination lane, this is a shuffle with the zero vector. + if ((II.getArgOperand(0) == II.getArgOperand(1)) || + (ZMask & (1 << DestLane))) { + V1 = ZeroVector; + // We may still move 32-bits of the first source vector from one lane + // to another. + ShuffleMask[DestLane] = SourceLane; + // The zero mask may override the previous insert operation. + for (unsigned i = 0; i < 4; ++i) + if ((ZMask >> i) & 0x1) + ShuffleMask[i] = i + 4; + } else { + // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? + return nullptr; + } + } else { + // Replace the selected destination lane with the selected source lane. + ShuffleMask[DestLane] = SourceLane + 4; + } + + return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); +} + +/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding +/// or conversion to a shuffle vector. +static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, + ConstantInt *CILength, ConstantInt *CIIndex, + InstCombiner::BuilderTy &Builder) { + auto LowConstantHighUndef = [&](uint64_t Val) { + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + }; + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast(Op0); + ConstantInt *CI0 = + C0 ? dyn_cast_or_null(C0->getAggregateElement((unsigned)0)) + : nullptr; + + // Attempt to constant fold. + if (CILength && CIIndex) { + // From AMD documentation: "The bit index and field length are each six + // bits in length other bits of the field are ignored." + APInt APIndex = CIIndex->getValue().zextOrTrunc(6); + APInt APLength = CILength->getValue().zextOrTrunc(6); + + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize EXTRQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + auto *ShufTy = FixedVectorType::get(IntTy8, 16); + + SmallVector ShuffleMask; + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back(i + Index); + for (int i = Length; i != 8; ++i) + ShuffleMask.push_back(i + 16); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(-1); + + Value *SV = Builder.CreateShuffleVector( + Builder.CreateBitCast(Op0, ShufTy), + ConstantAggregateZero::get(ShufTy), ShuffleMask); + return Builder.CreateBitCast(SV, II.getType()); + } + + // Constant Fold - shift Index'th bit to lowest position and mask off + // Length bits. 
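+ // E.g. extrqi(%x, len 8, idx 16) with a constant %x folds to
+ // {(x >> 16) & 0xff, undef}.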
+ if (CI0) { + APInt Elt = CI0->getValue(); + Elt.lshrInPlace(Index); + Elt = Elt.zextOrTrunc(Length); + return LowConstantHighUndef(Elt.getZExtValue()); + } + + // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { + Value *Args[] = {Op0, CILength, CIIndex}; + Module *M = II.getModule(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + return Builder.CreateCall(F, Args); + } + } + + // Constant Fold - extraction from zero is always {zero, undef}. + if (CI0 && CI0->isZero()) + return LowConstantHighUndef(0); + + return nullptr; +} + +/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant +/// folding or conversion to a shuffle vector. +static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, + APInt APLength, APInt APIndex, + InstCombiner::BuilderTy &Builder) { + // From AMD documentation: "The bit index and field length are each six bits + // in length other bits of the field are ignored." + APIndex = APIndex.zextOrTrunc(6); + APLength = APLength.zextOrTrunc(6); + + // Attempt to constant fold. + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize INSERTQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + auto *ShufTy = FixedVectorType::get(IntTy8, 16); + + SmallVector ShuffleMask; + for (int i = 0; i != (int)Index; ++i) + ShuffleMask.push_back(i); + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back(i + 16); + for (int i = Index + Length; i != 8; ++i) + ShuffleMask.push_back(i); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(-1); + + Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), + Builder.CreateBitCast(Op1, ShufTy), + ShuffleMask); + return Builder.CreateBitCast(SV, II.getType()); + } + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast(Op0); + Constant *C1 = dyn_cast(Op1); + ConstantInt *CI00 = + C0 ? dyn_cast_or_null(C0->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CI10 = + C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)0)) + : nullptr; + + // Constant Fold - insert bottom Length bits starting at the Index'th bit. 
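+ // E.g. insertqi(%a, %b, len 8, idx 16) with constant operands folds to
+ // {(a & ~(0xff << 16)) | ((b & 0xff) << 16), undef}.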
+ if (CI00 && CI10) { + APInt V00 = CI00->getValue(); + APInt V10 = CI10->getValue(); + APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); + V00 = V00 & ~Mask; + V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); + APInt Val = V00 | V10; + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + } + + // If we were an INSERTQ call, we'll save demanded elements if we convert to + // INSERTQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Constant *CILength = ConstantInt::get(IntTy8, Length, false); + Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); + + Value *Args[] = {Op0, Op1, CILength, CIIndex}; + Module *M = II.getModule(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return Builder.CreateCall(F, Args); + } + + return nullptr; +} + +/// Attempt to convert pshufb* to shufflevector if the mask is constant. +static Value *simplifyX86pshufb(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Constant *V = dyn_cast(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast(II.getType()); + unsigned NumElts = VecTy->getNumElements(); + assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && + "Unexpected number of elements in shuffle mask!"); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[64]; + + // Each byte in the shuffle control mask forms an index to permute the + // corresponding byte in the destination operand. + for (unsigned I = 0; I < NumElts; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa(COp) && !isa(COp))) + return nullptr; + + if (isa(COp)) { + Indexes[I] = -1; + continue; + } + + int8_t Index = cast(COp)->getValue().getZExtValue(); + + // If the most significant bit (bit[7]) of each byte of the shuffle + // control mask is set, then zero is written in the result byte. + // The zero vector is in the right-hand side of the resulting + // shufflevector. + + // The value of each index for the high 128-bit lane is the least + // significant 4 bits of the respective shuffle control byte. + Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0); + Indexes[I] = Index; + } + + auto V1 = II.getArgOperand(0); + auto V2 = Constant::getNullValue(VecTy); + return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); +} + +/// Attempt to convert vpermilvar* to shufflevector if the mask is constant. +static Value *simplifyX86vpermilvar(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + Constant *V = dyn_cast(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast(II.getType()); + unsigned NumElts = VecTy->getNumElements(); + bool IsPD = VecTy->getScalarType()->isDoubleTy(); + unsigned NumLaneElts = IsPD ? 2 : 4; + assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[16]; + + // The intrinsics only read one or two bits, clear the rest. 
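+ // (vpermilvar.ps reads bits [1:0] of each control element; the pd variants
+ // read only bit 1, hence the extra shift below.)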
+ for (unsigned I = 0; I < NumElts; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa(COp) && !isa(COp))) + return nullptr; + + if (isa(COp)) { + Indexes[I] = -1; + continue; + } + + APInt Index = cast(COp)->getValue(); + Index = Index.zextOrTrunc(32).getLoBits(2); + + // The PD variants uses bit 1 to select per-lane element index, so + // shift down to convert to generic shuffle mask index. + if (IsPD) + Index.lshrInPlace(1); + + // The _256 variants are a bit trickier since the mask bits always index + // into the corresponding 128 half. In order to convert to a generic + // shuffle, we have to make that explicit. + Index += APInt(32, (I / NumLaneElts) * NumLaneElts); + + Indexes[I] = Index.getZExtValue(); + } + + auto V1 = II.getArgOperand(0); + auto V2 = UndefValue::get(V1->getType()); + return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); +} + +/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. +static Value *simplifyX86vpermv(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + auto *V = dyn_cast(II.getArgOperand(1)); + if (!V) + return nullptr; + + auto *VecTy = cast(II.getType()); + unsigned Size = VecTy->getNumElements(); + assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && + "Unexpected shuffle mask size"); + + // Construct a shuffle mask from constant integers or UNDEFs. + int Indexes[64]; + + for (unsigned I = 0; I < Size; ++I) { + Constant *COp = V->getAggregateElement(I); + if (!COp || (!isa(COp) && !isa(COp))) + return nullptr; + + if (isa(COp)) { + Indexes[I] = -1; + continue; + } + + uint32_t Index = cast(COp)->getZExtValue(); + Index &= Size - 1; + Indexes[I] = Index; + } + + auto V1 = II.getArgOperand(0); + auto V2 = UndefValue::get(VecTy); + return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, Size)); +} + +Optional +X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, + unsigned DemandedWidth) { + APInt UndefElts(Width, 0); + APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); + return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); + }; + + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + case Intrinsic::x86_bmi_bextr_32: + case Intrinsic::x86_bmi_bextr_64: + case Intrinsic::x86_tbm_bextri_u32: + case Intrinsic::x86_tbm_bextri_u64: + // If the RHS is a constant we can try some simplifications. + if (auto *C = dyn_cast(II.getArgOperand(1))) { + uint64_t Shift = C->getZExtValue(); + uint64_t Length = (Shift >> 8) & 0xff; + Shift &= 0xff; + unsigned BitWidth = II.getType()->getIntegerBitWidth(); + // If the length is 0 or the shift is out of range, replace with zero. + if (Length == 0 || Shift >= BitWidth) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + // If the LHS is also a constant, we can completely constant fold this. + if (auto *InC = dyn_cast(II.getArgOperand(0))) { + uint64_t Result = InC->getZExtValue() >> Shift; + if (Length > BitWidth) + Length = BitWidth; + Result &= maskTrailingOnes(Length); + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we + // are only masking bits that a shift already cleared? + } + break; + + case Intrinsic::x86_bmi_bzhi_32: + case Intrinsic::x86_bmi_bzhi_64: + // If the RHS is a constant we can try some simplifications. 
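+ // BZHI copies the source and zeroes every bit at or above the index taken
+ // from bits [7:0] of the second operand.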
+ if (auto *C = dyn_cast(II.getArgOperand(1))) { + uint64_t Index = C->getZExtValue() & 0xff; + unsigned BitWidth = II.getType()->getIntegerBitWidth(); + if (Index >= BitWidth) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + if (Index == 0) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + // If the LHS is also a constant, we can completely constant fold this. + if (auto *InC = dyn_cast(II.getArgOperand(0))) { + uint64_t Result = InC->getZExtValue(); + Result &= maskTrailingOnes(Index); + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + // TODO should we convert this to an AND if the RHS is constant? + } + break; + case Intrinsic::x86_bmi_pext_32: + case Intrinsic::x86_bmi_pext_64: + if (auto *MaskC = dyn_cast(II.getArgOperand(1))) { + if (MaskC->isNullValue()) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + if (MaskC->isAllOnesValue()) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + + if (auto *SrcC = dyn_cast(II.getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToSet = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToTest = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToSet <<= 1; + // Clear lowest set bit. + Mask &= Mask - 1; + } + + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + } + break; + case Intrinsic::x86_bmi_pdep_32: + case Intrinsic::x86_bmi_pdep_64: + if (auto *MaskC = dyn_cast(II.getArgOperand(1))) { + if (MaskC->isNullValue()) { + return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); + } + if (MaskC->isAllOnesValue()) { + return IC.replaceInstUsesWith(II, II.getArgOperand(0)); + } + + if (auto *SrcC = dyn_cast(II.getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToTest = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToSet = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToTest <<= 1; + // Clear lowest set bit; + Mask &= Mask - 1; + } + + return IC.replaceInstUsesWith(II, + ConstantInt::get(II.getType(), Result)); + } + } + break; + + case Intrinsic::x86_sse_cvtss2si: + case Intrinsic::x86_sse_cvtss2si64: + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvtsd2si: + case Intrinsic::x86_sse2_cvtsd2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + case Intrinsic::x86_avx512_vcvtss2si32: + case Intrinsic::x86_avx512_vcvtss2si64: + case Intrinsic::x86_avx512_vcvtss2usi32: + case Intrinsic::x86_avx512_vcvtss2usi64: + case Intrinsic::x86_avx512_vcvtsd2si32: + case Intrinsic::x86_avx512_vcvtsd2si64: + case Intrinsic::x86_avx512_vcvtsd2usi32: + case Intrinsic::x86_avx512_vcvtsd2usi64: + case Intrinsic::x86_avx512_cvttss2si: + case Intrinsic::x86_avx512_cvttss2si64: + case Intrinsic::x86_avx512_cvttss2usi: + case Intrinsic::x86_avx512_cvttss2usi64: + case Intrinsic::x86_avx512_cvttsd2si: + case Intrinsic::x86_avx512_cvttsd2si64: + case Intrinsic::x86_avx512_cvttsd2usi: + case Intrinsic::x86_avx512_cvttsd2usi64: { + // These intrinsics only demand the 0th element of their input vectors. If + // we can simplify the input based on that, do so now. 
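+ // For example, if the operand is insertelement %v, %f, 1, only lane 0 is
+ // read, so the insert into lane 1 is dead and %v can be used directly.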
+ Value *Arg = II.getArgOperand(0); + unsigned VWidth = cast(Arg->getType())->getNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_avx2_pmovmskb: + if (Value *V = simplifyX86movmsk(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_comieq_sd: + case Intrinsic::x86_sse2_comige_sd: + case Intrinsic::x86_sse2_comigt_sd: + case Intrinsic::x86_sse2_comile_sd: + case Intrinsic::x86_sse2_comilt_sd: + case Intrinsic::x86_sse2_comineq_sd: + case Intrinsic::x86_sse2_ucomieq_sd: + case Intrinsic::x86_sse2_ucomige_sd: + case Intrinsic::x86_sse2_ucomigt_sd: + case Intrinsic::x86_sse2_ucomile_sd: + case Intrinsic::x86_sse2_ucomilt_sd: + case Intrinsic::x86_sse2_ucomineq_sd: + case Intrinsic::x86_avx512_vcomi_ss: + case Intrinsic::x86_avx512_vcomi_sd: + case Intrinsic::x86_avx512_mask_cmp_ss: + case Intrinsic::x86_avx512_mask_cmp_sd: { + // These intrinsics only demand the 0th element of their input vectors. If + // we can simplify the input based on that, do so now. + bool MadeChange = false; + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + unsigned VWidth = cast(Arg0->getType())->getNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + case Intrinsic::x86_avx512_cmp_pd_128: + case Intrinsic::x86_avx512_cmp_pd_256: + case Intrinsic::x86_avx512_cmp_pd_512: + case Intrinsic::x86_avx512_cmp_ps_128: + case Intrinsic::x86_avx512_cmp_ps_256: + case Intrinsic::x86_avx512_cmp_ps_512: { + // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + bool Arg0IsZero = match(Arg0, PatternMatch::m_PosZeroFP()); + if (Arg0IsZero) + std::swap(Arg0, Arg1); + Value *A, *B; + // This fold requires only the NINF(not +/- inf) since inf minus + // inf is nan. + // NSZ(No Signed Zeros) is not needed because zeros of any sign are + // equal for both compares. + // NNAN is not needed because nans compare the same for both compares. + // The compare intrinsic uses the above assumptions and therefore + // doesn't require additional flags. 
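+ // For example, comparing an ninf (%a - %b) against +0.0 yields the same result
+ // for every predicate as comparing %a against %b directly, so the subtraction
+ // can be dropped.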
+ if ((match(Arg0, + PatternMatch::m_OneUse(PatternMatch::m_FSub( + PatternMatch::m_Value(A), PatternMatch::m_Value(B)))) && + match(Arg1, PatternMatch::m_PosZeroFP()) && isa(Arg0) && + cast(Arg0)->getFastMathFlags().noInfs())) { + if (Arg0IsZero) + std::swap(A, B); + IC.replaceOperand(II, 0, A); + IC.replaceOperand(II, 1, B); + return &II; + } + break; + } + + case Intrinsic::x86_avx512_add_ps_512: + case Intrinsic::x86_avx512_div_ps_512: + case Intrinsic::x86_avx512_mul_ps_512: + case Intrinsic::x86_avx512_sub_ps_512: + case Intrinsic::x86_avx512_add_pd_512: + case Intrinsic::x86_avx512_div_pd_512: + case Intrinsic::x86_avx512_mul_pd_512: + case Intrinsic::x86_avx512_sub_pd_512: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. + if (auto *R = dyn_cast(II.getArgOperand(2))) { + if (R->getValue() == 4) { + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + + Value *V; + switch (IID) { + default: + llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_add_ps_512: + case Intrinsic::x86_avx512_add_pd_512: + V = IC.Builder.CreateFAdd(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_sub_ps_512: + case Intrinsic::x86_avx512_sub_pd_512: + V = IC.Builder.CreateFSub(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_mul_ps_512: + case Intrinsic::x86_avx512_mul_pd_512: + V = IC.Builder.CreateFMul(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_div_ps_512: + case Intrinsic::x86_avx512_div_pd_512: + V = IC.Builder.CreateFDiv(Arg0, Arg1); + break; + } + + return IC.replaceInstUsesWith(II, V); + } + } + break; + + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. + if (auto *R = dyn_cast(II.getArgOperand(4))) { + if (R->getValue() == 4) { + // Extract the element as scalars. + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); + Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); + + Value *V; + switch (IID) { + default: + llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + V = IC.Builder.CreateFAdd(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + V = IC.Builder.CreateFSub(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + V = IC.Builder.CreateFMul(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + V = IC.Builder.CreateFDiv(LHS, RHS); + break; + } + + // Handle the masking aspect of the intrinsic. + Value *Mask = II.getArgOperand(3); + auto *C = dyn_cast(Mask); + // We don't need a select if we know the mask bit is a 1. + if (!C || !C->getValue()[0]) { + // Cast the mask to an i1 vector and then extract the lowest element. 
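+ // For example, an i8 mask becomes an <8 x i1> vector, and bit 0 selects
+ // between the scalar result and lane 0 of the passthru operand.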
+ auto *MaskTy = FixedVectorType::get(
+ IC.Builder.getInt1Ty(),
+ cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
+ Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
+ // Extract the lowest element from the passthru operand.
+ Value *Passthru =
+ IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
+ V = IC.Builder.CreateSelect(Mask, V, Passthru);
+ }
+
+ // Insert the result back into the original argument 0.
+ V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
+
+ return IC.replaceInstUsesWith(II, V);
+ }
+ }
+ break;
+
+ // Constant fold ashr( <A x Bi>, Ci ).
+ // Constant fold lshr( <A x Bi>, Ci ).
+ // Constant fold shl( <A x Bi>, Ci ).
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ if (Value *V = simplifyX86immShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+ break;
+
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512: {
+ if (Value *V = simplifyX86immShift(II, IC.Builder)) {
+ return IC.replaceInstUsesWith(II, V);
+ }
+
+ // SSE2/AVX2 uses only the first 64 bits of the 128-bit vector
+ // operand to compute the shift amount.
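+ // For psrl.w, for instance, the <8 x i16> count operand contributes only its
+ // low four elements (64 bits), so the demanded-elements analysis can ignore
+ // the upper half.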
+ Value *Arg1 = II.getArgOperand(1); + assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && + "Unexpected packed shift size"); + unsigned VWidth = cast(Arg1->getType())->getNumElements(); + + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { + return IC.replaceOperand(II, 1, V); + } + break; + } + + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: + if (Value *V = simplifyX86varShift(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packsswb_512: + if (Value *V = simplifyX86pack(II, IC.Builder, true)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx512_packusdw_512: + case Intrinsic::x86_avx512_packuswb_512: + if (Value *V = simplifyX86pack(II, IC.Builder, false)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_pclmulqdq: + case Intrinsic::x86_pclmulqdq_256: + case Intrinsic::x86_pclmulqdq_512: { + if (auto *C = dyn_cast(II.getArgOperand(2))) { + unsigned Imm = C->getZExtValue(); + + bool MadeChange = false; + Value *Arg0 = II.getArgOperand(0); + Value *Arg1 = II.getArgOperand(1); + unsigned VWidth = cast(Arg0->getType())->getNumElements(); + + APInt UndefElts1(VWidth, 0); + APInt DemandedElts1 = + APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); + if (Value *V = + IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + + APInt UndefElts2(VWidth, 0); + APInt DemandedElts2 = + APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); + if (Value *V = + IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + + // If either input elements are undef, the result is zero. 
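+ // (The undef input half may be chosen to be zero, and a carry-less multiply
+ // by zero is zero.)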
+ if (DemandedElts1.isSubsetOf(UndefElts1) || + DemandedElts2.isSubsetOf(UndefElts2)) { + return IC.replaceInstUsesWith(II, + ConstantAggregateZero::get(II.getType())); + } + + if (MadeChange) { + return &II; + } + } + break; + } + + case Intrinsic::x86_sse41_insertps: + if (Value *V = simplifyX86insertps(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_sse4a_extrq: { + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth0 = cast(Op0->getType())->getNumElements(); + unsigned VWidth1 = cast(Op1->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 16 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast(Op1); + ConstantInt *CILength = + C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CIIndex = + C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or EXTRQI call. + if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + + // EXTRQ only uses the lowest 64-bits of the first 128-bit vector + // operands and the lowest 16-bits of the second. + bool MadeChange = false; + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + + case Intrinsic::x86_sse4a_extrqi: { + // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining + // bits of the lower 64-bits. The upper 64-bits are undefined. + Value *Op0 = II.getArgOperand(0); + unsigned VWidth = cast(Op0->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast(II.getArgOperand(1)); + ConstantInt *CIIndex = dyn_cast(II.getArgOperand(2)); + + // Attempt to simplify to a constant or shuffle vector. + if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + + // EXTRQI only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_sse4a_insertq: { + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth = cast(Op0->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + cast(Op1->getType())->getNumElements() == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast(Op1); + ConstantInt *CI11 = + C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 
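+ // For INSERTQ the field length is taken from bits [5:0] and the insertion
+ // index from bits [13:8] of the upper 64-bit element of operand 1.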
+ if (CI11) { + const APInt &V11 = CI11->getValue(); + APInt Len = V11.zextOrTrunc(6); + APInt Idx = V11.lshr(8).zextOrTrunc(6); + if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + } + + // INSERTQ only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + return IC.replaceOperand(II, 0, V); + } + break; + } + + case Intrinsic::x86_sse4a_insertqi: { + // INSERTQI: Extract lowest Length bits from lower half of second source and + // insert over first source starting at Index bit. The upper 64-bits are + // undefined. + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + unsigned VWidth0 = cast(Op0->getType())->getNumElements(); + unsigned VWidth1 = cast(Op1->getType())->getNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 2 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast(II.getArgOperand(2)); + ConstantInt *CIIndex = dyn_cast(II.getArgOperand(3)); + + // Attempt to simplify to a constant or shuffle vector. + if (CILength && CIIndex) { + APInt Len = CILength->getValue().zextOrTrunc(6); + APInt Idx = CIIndex->getValue().zextOrTrunc(6); + if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + } + + // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector + // operands. + bool MadeChange = false; + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + IC.replaceOperand(II, 0, V); + MadeChange = true; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { + IC.replaceOperand(II, 1, V); + MadeChange = true; + } + if (MadeChange) { + return &II; + } + break; + } + + case Intrinsic::x86_sse41_pblendvb: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_avx_blendv_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx2_pblendvb: { + // fold (blend A, A, Mask) -> A + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + Value *Mask = II.getArgOperand(2); + if (Op0 == Op1) { + return IC.replaceInstUsesWith(II, Op0); + } + + // Zero Mask - select 1st argument. + if (isa(Mask)) { + return IC.replaceInstUsesWith(II, Op0); + } + + // Constant Mask - select 1st/2nd argument lane based on top bit of mask. + if (auto *ConstantMask = dyn_cast(Mask)) { + Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); + return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); + } + + // Convert to a vector select if we can bypass casts and find a boolean + // vector condition value. 
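+ // For example, blendvps(%x, %y, sext(<4 x i1> %c)) selects %y wherever the
+ // mask's sign bit is set, which is exactly select <4 x i1> %c, %y, %x.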
+ Value *BoolVec; + Mask = InstCombiner::peekThroughBitcast(Mask); + if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) && + BoolVec->getType()->isVectorTy() && + BoolVec->getType()->getScalarSizeInBits() == 1) { + assert(Mask->getType()->getPrimitiveSizeInBits() == + II.getType()->getPrimitiveSizeInBits() && + "Not expecting mask and operands with different sizes"); + + unsigned NumMaskElts = + cast(Mask->getType())->getNumElements(); + unsigned NumOperandElts = + cast(II.getType())->getNumElements(); + if (NumMaskElts == NumOperandElts) { + return SelectInst::Create(BoolVec, Op1, Op0); + } + + // If the mask has less elements than the operands, each mask bit maps to + // multiple elements of the operands. Bitcast back and forth. + if (NumMaskElts < NumOperandElts) { + Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType()); + Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType()); + Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); + return new BitCastInst(Sel, II.getType()); + } + } + + break; + } + + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + if (Value *V = simplifyX86pshufb(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx512_vpermilvar_ps_512: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: + case Intrinsic::x86_avx512_vpermilvar_pd_512: + if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + case Intrinsic::x86_avx512_permvar_df_256: + case Intrinsic::x86_avx512_permvar_df_512: + case Intrinsic::x86_avx512_permvar_di_256: + case Intrinsic::x86_avx512_permvar_di_512: + case Intrinsic::x86_avx512_permvar_hi_128: + case Intrinsic::x86_avx512_permvar_hi_256: + case Intrinsic::x86_avx512_permvar_hi_512: + case Intrinsic::x86_avx512_permvar_qi_128: + case Intrinsic::x86_avx512_permvar_qi_256: + case Intrinsic::x86_avx512_permvar_qi_512: + case Intrinsic::x86_avx512_permvar_sf_512: + case Intrinsic::x86_avx512_permvar_si_512: + if (Value *V = simplifyX86vpermv(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + case Intrinsic::x86_avx_maskload_ps: + case Intrinsic::x86_avx_maskload_pd: + case Intrinsic::x86_avx_maskload_ps_256: + case Intrinsic::x86_avx_maskload_pd_256: + case Intrinsic::x86_avx2_maskload_d: + case Intrinsic::x86_avx2_maskload_q: + case Intrinsic::x86_avx2_maskload_d_256: + case Intrinsic::x86_avx2_maskload_q_256: + if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { + return I; + } + break; + + case Intrinsic::x86_sse2_maskmov_dqu: + case Intrinsic::x86_avx_maskstore_ps: + case Intrinsic::x86_avx_maskstore_pd: + case Intrinsic::x86_avx_maskstore_ps_256: + case Intrinsic::x86_avx_maskstore_pd_256: + case Intrinsic::x86_avx2_maskstore_d: + case Intrinsic::x86_avx2_maskstore_q: + case Intrinsic::x86_avx2_maskstore_d_256: + case Intrinsic::x86_avx2_maskstore_q_256: + if (simplifyX86MaskedStore(II, IC)) { + return nullptr; + } + break; + + case Intrinsic::x86_addcarry_32: + case Intrinsic::x86_addcarry_64: + if (Value *V = simplifyX86addcarry(II, IC.Builder)) { + return IC.replaceInstUsesWith(II, V); + } + break; + + default: + break; + } + return None; +} + +Optional X86TTIImpl::simplifyDemandedUseBitsIntrinsic( + InstCombiner 
&IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const { + switch (II.getIntrinsicID()) { + default: + break; + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_avx2_pmovmskb: { + // MOVMSK copies the vector elements' sign bits to the low bits + // and zeros the high bits. + unsigned ArgWidth; + if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { + ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. + } else { + auto Arg = II.getArgOperand(0); + auto ArgType = cast(Arg->getType()); + ArgWidth = ArgType->getNumElements(); + } + + // If we don't need any of low bits then return zero, + // we know that DemandedMask is non-zero already. + APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); + Type *VTy = II.getType(); + if (DemandedElts.isNullValue()) { + return ConstantInt::getNullValue(VTy); + } + + // We know that the upper bits are set to zero. + Known.Zero.setBitsFrom(ArgWidth); + KnownBitsComputed = true; + break; + } + case Intrinsic::x86_sse42_crc32_64_64: + Known.Zero.setBitsFrom(32); + KnownBitsComputed = true; + break; + } + return None; +} + +Optional X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + simplifyAndSetOp) const { + unsigned VWidth = cast(II.getType())->getNumElements(); + switch (II.getIntrinsicID()) { + default: + break; + case Intrinsic::x86_xop_vfrcz_ss: + case Intrinsic::x86_xop_vfrcz_sd: + // The instructions for these intrinsics are speced to zero upper bits not + // pass them through like other scalar intrinsics. So we shouldn't just + // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. + // Instead we should return a zero vector. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return ConstantAggregateZero::get(II.getType()); + } + + // Only the lower element is used. + DemandedElts = 1; + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // Only the lower element is undefined. The high elements are zero. + UndefElts = UndefElts[0]; + break; + + // Unary scalar-as-vector operations that work column-wise. + case Intrinsic::x86_sse_rcp_ss: + case Intrinsic::x86_sse_rsqrt_ss: + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions + // checks). + break; + + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0. The low element is a function of both + // operands. + case Intrinsic::x86_sse_min_ss: + case Intrinsic::x86_sse_max_ss: + case Intrinsic::x86_sse_cmp_ss: + case Intrinsic::x86_sse2_min_sd: + case Intrinsic::x86_sse2_max_sd: + case Intrinsic::x86_sse2_cmp_sd: { + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1. + DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + + // Lower element is undefined if both lower elements are undefined. 
+ // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0]) + UndefElts.clearBit(0); + + break; + } + + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element comes from operand 1. + case Intrinsic::x86_sse41_round_ss: + case Intrinsic::x86_sse41_round_sd: { + // Don't use the low element of operand 0. + APInt DemandedElts2 = DemandedElts; + DemandedElts2.clearBit(0); + simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1. + DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + + // Take the high undef elements from operand 0 and take the lower element + // from operand 1. + UndefElts.clearBit(0); + UndefElts |= UndefElts2[0]; + break; + } + + // Three input scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element is a function of all + // three inputs. + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_max_ss_round: + case Intrinsic::x86_avx512_mask_min_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + case Intrinsic::x86_avx512_mask_max_sd_round: + case Intrinsic::x86_avx512_mask_min_sd_round: + simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + IC.addToWorklist(&II); + return II.getArgOperand(0); + } + + // Only lower element is used for operand 1 and 2. + DemandedElts = 1; + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); + simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); + + // Lower element is undefined if all three lower elements are undefined. + // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0] || !UndefElts3[0]) + UndefElts.clearBit(0); + + break; + + case Intrinsic::x86_sse2_packssdw_128: + case Intrinsic::x86_sse2_packsswb_128: + case Intrinsic::x86_sse2_packuswb_128: + case Intrinsic::x86_sse41_packusdw: + case Intrinsic::x86_avx2_packssdw: + case Intrinsic::x86_avx2_packsswb: + case Intrinsic::x86_avx2_packusdw: + case Intrinsic::x86_avx2_packuswb: + case Intrinsic::x86_avx512_packssdw_512: + case Intrinsic::x86_avx512_packsswb_512: + case Intrinsic::x86_avx512_packusdw_512: + case Intrinsic::x86_avx512_packuswb_512: { + auto *Ty0 = II.getArgOperand(0)->getType(); + unsigned InnerVWidth = cast(Ty0)->getNumElements(); + assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); + + unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; + unsigned VWidthPerLane = VWidth / NumLanes; + unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; + + // Per lane, pack the elements of the first input and then the second. + // e.g. 
+ // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) + // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) + for (int OpNum = 0; OpNum != 2; ++OpNum) { + APInt OpDemandedElts(InnerVWidth, 0); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + unsigned LaneIdx = Lane * VWidthPerLane; + for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { + unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; + if (DemandedElts[Idx]) + OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); + } + } + + // Demand elements from the operand. + APInt OpUndefElts(InnerVWidth, 0); + simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); + + // Pack the operand's UNDEF elements, one lane at a time. + OpUndefElts = OpUndefElts.zext(VWidth); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); + LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); + LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); + UndefElts |= LaneElts; + } + } + break; + } + + // PSHUFB + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + // PERMILVAR + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx512_vpermilvar_ps_512: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: + case Intrinsic::x86_avx512_vpermilvar_pd_512: + // PERMV + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: { + simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); + break; + } + + // SSE4A instructions leave the upper 64-bits of the 128-bit result + // in an undefined state. + case Intrinsic::x86_sse4a_extrq: + case Intrinsic::x86_sse4a_extrqi: + case Intrinsic::x86_sse4a_insertq: + case Intrinsic::x86_sse4a_insertqi: + UndefElts.setHighBits(VWidth / 2); + break; + } + return None; +} diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -22,6 +22,8 @@ namespace llvm { +class InstCombiner; + class X86TTIImpl : public BasicTTIImplBase { typedef BasicTTIImplBase BaseT; typedef TargetTransformInfo TTI; @@ -151,6 +153,18 @@ int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr); + Optional instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + Optional + simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, + APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) const; + Optional simplifyDemandedVectorEltsIntrinsic( + InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, + APInt &UndefElts2, APInt &UndefElts3, + std::function + SimplifyAndSetOp) const; + unsigned getAtomicMemIntrinsicMaxElementSize() const; int getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, diff --git a/llvm/lib/Transforms/InstCombine/CMakeLists.txt b/llvm/lib/Transforms/InstCombine/CMakeLists.txt --- a/llvm/lib/Transforms/InstCombine/CMakeLists.txt +++ b/llvm/lib/Transforms/InstCombine/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_TARGET_DEFINITIONS InstCombineTables.td) -tablegen(LLVM InstCombineTables.inc -gen-searchable-tables) -add_public_tablegen_target(InstCombineTableGen) - add_llvm_component_library(LLVMInstCombine InstructionCombining.cpp InstCombineAddSub.cpp diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp 
b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/AlignOf.h" #include "llvm/Support/Casting.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include #include @@ -860,7 +861,7 @@ return nullptr; } -Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) { +Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) { Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1); Constant *Op1C; if (!match(Op1, m_Constant(Op1C))) @@ -886,15 +887,15 @@ // zext(bool) + C -> bool ? C + 1 : C if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->getScalarSizeInBits() == 1) - return SelectInst::Create(X, AddOne(Op1C), Op1); + return SelectInst::Create(X, InstCombiner::AddOne(Op1C), Op1); // sext(bool) + C -> bool ? C - 1 : C if (match(Op0, m_SExt(m_Value(X))) && X->getType()->getScalarSizeInBits() == 1) - return SelectInst::Create(X, SubOne(Op1C), Op1); + return SelectInst::Create(X, InstCombiner::SubOne(Op1C), Op1); // ~X + C --> (C-1) - X if (match(Op0, m_Not(m_Value(X)))) - return BinaryOperator::CreateSub(SubOne(Op1C), X); + return BinaryOperator::CreateSub(InstCombiner::SubOne(Op1C), X); const APInt *C; if (!match(Op1, m_APInt(C))) @@ -1021,7 +1022,7 @@ // Simplifies X % C0 + (( X / C0 ) % C1) * C0 to X % (C0 * C1), where (C0 * C1) // does not overflow. -Value *InstCombiner::SimplifyAddWithRemainder(BinaryOperator &I) { +Value *InstCombinerImpl::SimplifyAddWithRemainder(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); Value *X, *MulOpV; APInt C0, MulOpC; @@ -1097,9 +1098,9 @@ return nullptr; } -Instruction * -InstCombiner::canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( - BinaryOperator &I) { +Instruction *InstCombinerImpl:: + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( + BinaryOperator &I) { assert((I.getOpcode() == Instruction::Add || I.getOpcode() == Instruction::Or || I.getOpcode() == Instruction::Sub) && @@ -1198,7 +1199,7 @@ return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType()); } -Instruction *InstCombiner::visitAdd(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), SQ.getWithInstruction(&I))) @@ -1486,7 +1487,7 @@ : BinaryOperator::CreateFDivFMF(XY, Z, &I); } -Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) { if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) @@ -1600,8 +1601,8 @@ /// Optimize pointer differences into the same array into a size. Consider: /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. -Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, - Type *Ty, bool IsNUW) { +Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS, + Type *Ty, bool IsNUW) { // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize // this. 
bool Swapped = false; @@ -1692,7 +1693,7 @@ return Builder.CreateIntCast(Result, Ty, true); } -Instruction *InstCombiner::visitSub(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), SQ.getWithInstruction(&I))) @@ -1806,14 +1807,14 @@ Value *X; if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) // C - (zext bool) --> bool ? C - 1 : C - return SelectInst::Create(X, SubOne(C), C); + return SelectInst::Create(X, InstCombiner::SubOne(C), C); if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) // C - (sext bool) --> bool ? C + 1 : C - return SelectInst::Create(X, AddOne(C), C); + return SelectInst::Create(X, InstCombiner::AddOne(C), C); // C - ~X == X + (1+C) if (match(Op1, m_Not(m_Value(X)))) - return BinaryOperator::CreateAdd(X, AddOne(C)); + return BinaryOperator::CreateAdd(X, InstCombiner::AddOne(C)); // Try to fold constant sub into select arguments. if (SelectInst *SI = dyn_cast(Op1)) @@ -2094,11 +2095,11 @@ return nullptr; } -Instruction *InstCombiner::visitFNeg(UnaryOperator &I) { +Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) { Value *Op = I.getOperand(0); if (Value *V = SimplifyFNegInst(Op, I.getFastMathFlags(), - SQ.getWithInstruction(&I))) + getSimplifyQuery().getWithInstruction(&I))) return replaceInstUsesWith(I, V); if (Instruction *X = foldFNegIntoConstant(I)) @@ -2117,10 +2118,10 @@ return nullptr; } -Instruction *InstCombiner::visitFSub(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) { if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), - SQ.getWithInstruction(&I))) + getSimplifyQuery().getWithInstruction(&I))) return replaceInstUsesWith(I, V); if (Instruction *X = foldVectorBinop(I)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -13,10 +13,12 @@ #include "InstCombineInternal.h" #include "llvm/Analysis/CmpInstAnalysis.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Utils/Local.h" + using namespace llvm; using namespace PatternMatch; @@ -114,10 +116,9 @@ /// This handles expressions of the form ((val OP C1) & C2). Where /// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. -Instruction *InstCombiner::OptAndOp(BinaryOperator *Op, - ConstantInt *OpRHS, - ConstantInt *AndRHS, - BinaryOperator &TheAnd) { +Instruction *InstCombinerImpl::OptAndOp(BinaryOperator *Op, ConstantInt *OpRHS, + ConstantInt *AndRHS, + BinaryOperator &TheAnd) { Value *X = Op->getOperand(0); switch (Op->getOpcode()) { @@ -161,8 +162,9 @@ /// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise /// (V < Lo || V >= Hi). This method expects that Lo < Hi. IsSigned indicates /// whether to treat V, Lo, and Hi as signed or not. -Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi, - bool isSigned, bool Inside) { +Value *InstCombinerImpl::insertRangeTest(Value *V, const APInt &Lo, + const APInt &Hi, bool isSigned, + bool Inside) { assert((isSigned ? 
Lo.slt(Hi) : Lo.ult(Hi)) && "Lo is not < Hi in range emission code!"); @@ -437,11 +439,10 @@ /// (icmp(A & X) ==/!= Y), where the left-hand side is of type Mask_NotAllZeros /// and the right hand side is of type BMask_Mixed. For example, /// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8). -static Value * foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( - ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, - Value *A, Value *B, Value *C, Value *D, Value *E, - ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, - llvm::InstCombiner::BuilderTy &Builder) { +static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( + ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C, + Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, + InstCombiner::BuilderTy &Builder) { // We are given the canonical form: // (icmp ne (A & B), 0) & (icmp eq (A & D), E). // where D & E == E. @@ -568,11 +569,9 @@ /// (icmp(A & X) ==/!= Y), where the left-hand side and the right hand side /// aren't of the common mask pattern type. static Value *foldLogOpOfMaskedICmpsAsymmetric( - ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, - Value *A, Value *B, Value *C, Value *D, Value *E, - ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, - unsigned LHSMask, unsigned RHSMask, - llvm::InstCombiner::BuilderTy &Builder) { + ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C, + Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR, + unsigned LHSMask, unsigned RHSMask, InstCombiner::BuilderTy &Builder) { assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) && "Expected equality predicates for masked type of icmps."); // Handle Mask_NotAllZeros-BMask_Mixed cases. @@ -603,7 +602,7 @@ /// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) /// into a single (icmp(A & X) ==/!= Y). static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, - llvm::InstCombiner::BuilderTy &Builder) { + InstCombiner::BuilderTy &Builder) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); Optional> MaskPair = @@ -748,8 +747,8 @@ /// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n /// If \p Inverted is true then the check is for the inverted range, e.g. /// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n -Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, - bool Inverted) { +Value *InstCombinerImpl::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, + bool Inverted) { // Check the lower range comparison, e.g. x >= 0 // InstCombine already ensured that if there is a constant it's on the RHS. ConstantInt *RangeStart = dyn_cast(Cmp0->getOperand(1)); @@ -856,8 +855,9 @@ // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) -Value *InstCombiner::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS, - BinaryOperator &Logic) { +Value *InstCombinerImpl::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, + ICmpInst *RHS, + BinaryOperator &Logic) { bool JoinedByAnd = Logic.getOpcode() == Instruction::And; assert((JoinedByAnd || Logic.getOpcode() == Instruction::Or) && "Wrong opcode"); @@ -1184,8 +1184,8 @@ } /// Fold (icmp)&(icmp) if possible. 
-Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, - BinaryOperator &And) { +Value *InstCombinerImpl::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, + BinaryOperator &And) { const SimplifyQuery Q = SQ.getWithInstruction(&And); // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) @@ -1404,7 +1404,8 @@ return nullptr; } -Value *InstCombiner::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd) { +Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, + bool IsAnd) { Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1); Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1); FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate(); @@ -1514,8 +1515,8 @@ Value *A, *B; if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) && match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) && - !isFreeToInvert(A, A->hasOneUse()) && - !isFreeToInvert(B, B->hasOneUse())) { + !InstCombiner::isFreeToInvert(A, A->hasOneUse()) && + !InstCombiner::isFreeToInvert(B, B->hasOneUse())) { Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan"); return BinaryOperator::CreateNot(AndOr); } @@ -1523,7 +1524,7 @@ return nullptr; } -bool InstCombiner::shouldOptimizeCast(CastInst *CI) { +bool InstCombinerImpl::shouldOptimizeCast(CastInst *CI) { Value *CastSrc = CI->getOperand(0); // Noop casts and casts of constants should be eliminated trivially. @@ -1579,7 +1580,7 @@ } /// Fold {and,or,xor} (cast X), Y. -Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) { +Instruction *InstCombinerImpl::foldCastedBitwiseLogic(BinaryOperator &I) { auto LogicOpc = I.getOpcode(); assert(I.isBitwiseLogicOp() && "Unexpected opcode for bitwise logic folding"); @@ -1725,7 +1726,7 @@ /// Try to use narrower ops (sink zext ops) for an 'and' with binop operand and /// a common zext operand: and (binop (zext X), C), (zext X). -Instruction *InstCombiner::narrowMaskedBinOp(BinaryOperator &And) { +Instruction *InstCombinerImpl::narrowMaskedBinOp(BinaryOperator &And) { // This transform could also apply to {or, and, xor}, but there are better // folds for those cases, so we don't expect those patterns here. AShr is not // handled because it should always be transformed to LShr in this sequence. @@ -1767,7 +1768,7 @@ // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. -Instruction *InstCombiner::visitAnd(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { if (Value *V = SimplifyAndInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -2033,7 +2034,7 @@ return nullptr; } -Instruction *InstCombiner::matchBSwap(BinaryOperator &Or) { +Instruction *InstCombinerImpl::matchBSwap(BinaryOperator &Or) { assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'"); Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1); @@ -2216,7 +2217,7 @@ /// We have an expression of the form (A & C) | (B & D). If A is a scalar or /// vector composed of all-zeros or all-ones values and is the bitwise 'not' of /// B, it can be used as the condition operand of a select instruction. -Value *InstCombiner::getSelectCondition(Value *A, Value *B) { +Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B) { // Step 1: We may have peeked through bitcasts in the caller. 
// Exit immediately if we don't have (vector) integer types. Type *Ty = A->getType(); @@ -2273,8 +2274,8 @@ /// We have an expression of the form (A & C) | (B & D). Try to simplify this /// to "A' ? C : D", where A' is a boolean or vector of booleans. -Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B, - Value *D) { +Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B, + Value *D) { // The potential condition of the select may be bitcasted. In that case, look // through its bitcast and the corresponding bitcast of the 'not' condition. Type *OrigType = A->getType(); @@ -2294,8 +2295,8 @@ } /// Fold (icmp)|(icmp) if possible. -Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, - BinaryOperator &Or) { +Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, + BinaryOperator &Or) { const SimplifyQuery Q = SQ.getWithInstruction(&Or); // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) @@ -2560,7 +2561,7 @@ // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. -Instruction *InstCombiner::visitOr(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Value *V = SimplifyOrInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -2929,8 +2930,8 @@ return nullptr; } -Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, - BinaryOperator &I) { +Value *InstCombinerImpl::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, + BinaryOperator &I) { assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS && I.getOperand(1) == RHS && "Should be 'xor' with these operands"); @@ -3088,9 +3089,9 @@ return nullptr; // We only want to do the transform if it is free to do. - if (isFreeToInvert(X, X->hasOneUse())) { + if (InstCombiner::isFreeToInvert(X, X->hasOneUse())) { // Ok, good. - } else if (isFreeToInvert(Y, Y->hasOneUse())) { + } else if (InstCombiner::isFreeToInvert(Y, Y->hasOneUse())) { std::swap(X, Y); } else return nullptr; @@ -3102,7 +3103,7 @@ // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed. -Instruction *InstCombiner::visitXor(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -9,8 +9,10 @@ // This file implements the visit functions for atomic rmw instructions. 
// //===----------------------------------------------------------------------===// + #include "InstCombineInternal.h" #include "llvm/IR/Instructions.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" using namespace llvm; @@ -30,7 +32,7 @@ default: return false; }; - + auto C = dyn_cast(RMWI.getValOperand()); if(!C) return false; @@ -93,11 +95,11 @@ } } -Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { +Instruction *InstCombinerImpl::visitAtomicRMWInst(AtomicRMWInst &RMWI) { // Volatile RMWs perform a load and a store, we cannot replace this by just a // load or just a store. We chose not to canonicalize out of general paranoia - // about user expectations around volatile. + // about user expectations around volatile. if (RMWI.isVolatile()) return nullptr; @@ -115,7 +117,7 @@ "AtomicRMWs don't make sense with Unordered or NotAtomic"); // Any atomicrmw xchg with no uses can be converted to a atomic store if the - // ordering is compatible. + // ordering is compatible. if (RMWI.getOperation() == AtomicRMWInst::Xchg && RMWI.use_empty()) { if (Ordering != AtomicOrdering::Release && @@ -127,14 +129,14 @@ SI->setAlignment(DL.getABITypeAlign(RMWI.getType())); return eraseInstFromFunction(RMWI); } - + if (!isIdempotentRMW(RMWI)) return nullptr; // We chose to canonicalize all idempotent operations to an single // operation code and constant. This makes it easier for the rest of the // optimizer to match easily. The choices of or w/0 and fadd w/-0.0 are - // arbitrary. + // arbitrary. if (RMWI.getType()->isIntegerTy() && RMWI.getOperation() != AtomicRMWInst::Or) { RMWI.setOperation(AtomicRMWInst::Or); @@ -149,7 +151,7 @@ if (Ordering != AtomicOrdering::Acquire && Ordering != AtomicOrdering::Monotonic) return nullptr; - + LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand(), "", false, DL.getABITypeAlign(RMWI.getType()), Ordering, RMWI.getSyncScopeID()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Attributes.h" @@ -47,9 +48,6 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/IntrinsicsHexagon.h" -#include "llvm/IR/IntrinsicsNVPTX.h" -#include "llvm/IR/IntrinsicsPowerPC.h" -#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" @@ -68,6 +66,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" #include @@ -100,24 +99,7 @@ return Ty; } -/// Return a constant boolean vector that has true elements in all positions -/// where the input constant data vector has an element with the sign bit set. 
-static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { - SmallVector BoolVec; - IntegerType *BoolTy = Type::getInt1Ty(V->getContext()); - for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) { - Constant *Elt = V->getElementAsConstant(I); - assert((isa(Elt) || isa(Elt)) && - "Unexpected constant data vector element type"); - bool Sign = V->getElementType()->isIntegerTy() - ? cast(Elt)->isNegative() - : cast(Elt)->isNegative(); - BoolVec.push_back(ConstantInt::get(BoolTy, Sign)); - } - return ConstantVector::get(BoolVec); -} - -Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { +Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { Align DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT); MaybeAlign CopyDstAlign = MI->getDestAlign(); if (!CopyDstAlign || *CopyDstAlign < DstAlign) { @@ -232,7 +214,7 @@ return MI; } -Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { +Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) { const Align KnownAlignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); MaybeAlign MemSetAlign = MI->getDestAlign(); @@ -292,820 +274,9 @@ return nullptr; } -static Value *simplifyX86immShift(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - bool LogicalShift = false; - bool ShiftLeft = false; - bool IsImm = false; - - switch (II.getIntrinsicID()) { - default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx512_psrai_q_128: - case Intrinsic::x86_avx512_psrai_q_256: - case Intrinsic::x86_avx512_psrai_d_512: - case Intrinsic::x86_avx512_psrai_q_512: - case Intrinsic::x86_avx512_psrai_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_avx2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx512_psra_q_128: - case Intrinsic::x86_avx512_psra_q_256: - case Intrinsic::x86_avx512_psra_d_512: - case Intrinsic::x86_avx512_psra_q_512: - case Intrinsic::x86_avx512_psra_w_512: - LogicalShift = false; - ShiftLeft = false; - break; - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx512_psrli_d_512: - case Intrinsic::x86_avx512_psrli_q_512: - case Intrinsic::x86_avx512_psrli_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx512_psrl_d_512: - case Intrinsic::x86_avx512_psrl_q_512: - case Intrinsic::x86_avx512_psrl_w_512: - LogicalShift = true; - ShiftLeft = false; - break; - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx512_pslli_d_512: - case Intrinsic::x86_avx512_pslli_q_512: - case Intrinsic::x86_avx512_pslli_w_512: - IsImm = true; - LLVM_FALLTHROUGH; - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case 
Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx512_psll_d_512: - case Intrinsic::x86_avx512_psll_q_512: - case Intrinsic::x86_avx512_psll_w_512: - LogicalShift = true; - ShiftLeft = true; - break; - } - assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); - - auto Vec = II.getArgOperand(0); - auto Amt = II.getArgOperand(1); - auto VT = cast(Vec->getType()); - auto SVT = VT->getElementType(); - auto AmtVT = Amt->getType(); - unsigned VWidth = VT->getNumElements(); - unsigned BitWidth = SVT->getPrimitiveSizeInBits(); - - // If the shift amount is guaranteed to be in-range we can replace it with a - // generic shift. If its guaranteed to be out of range, logical shifts combine to - // zero and arithmetic shifts are clamped to (BitWidth - 1). - if (IsImm) { - assert(AmtVT ->isIntegerTy(32) && - "Unexpected shift-by-immediate type"); - KnownBits KnownAmtBits = - llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); - if (KnownAmtBits.getMaxValue().ult(BitWidth)) { - Amt = Builder.CreateZExtOrTrunc(Amt, SVT); - Amt = Builder.CreateVectorSplat(VWidth, Amt); - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - if (KnownAmtBits.getMinValue().uge(BitWidth)) { - if (LogicalShift) - return ConstantAggregateZero::get(VT); - Amt = ConstantInt::get(SVT, BitWidth - 1); - return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); - } - } else { - // Ensure the first element has an in-range value and the rest of the - // elements in the bottom 64 bits are zero. - assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && - cast(AmtVT)->getElementType() == SVT && - "Unexpected shift-by-scalar type"); - unsigned NumAmtElts = cast(AmtVT)->getNumElements(); - APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); - APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); - KnownBits KnownLowerBits = llvm::computeKnownBits( - Amt, DemandedLower, II.getModule()->getDataLayout()); - KnownBits KnownUpperBits = llvm::computeKnownBits( - Amt, DemandedUpper, II.getModule()->getDataLayout()); - if (KnownLowerBits.getMaxValue().ult(BitWidth) && - (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) { - SmallVector ZeroSplat(VWidth, 0); - Amt = Builder.CreateShuffleVector(Amt, Amt, ZeroSplat); - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - } - - // Simplify if count is constant vector. - auto CDV = dyn_cast(Amt); - if (!CDV) - return nullptr; - - // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector - // operand to compute the shift amount. - assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && - cast(AmtVT)->getElementType() == SVT && - "Unexpected shift-by-scalar type"); - - // Concatenate the sub-elements to create the 64-bit value. - APInt Count(64, 0); - for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { - unsigned SubEltIdx = (NumSubElts - 1) - i; - auto SubElt = cast(CDV->getElementAsConstant(SubEltIdx)); - Count <<= BitWidth; - Count |= SubElt->getValue().zextOrTrunc(64); - } - - // If shift-by-zero then just return the original value. - if (Count.isNullValue()) - return Vec; - - // Handle cases when Shift >= BitWidth. - if (Count.uge(BitWidth)) { - // If LogicalShift - just return zero. 
- if (LogicalShift) - return ConstantAggregateZero::get(VT); - - // If ArithmeticShift - clamp Shift to (BitWidth - 1). - Count = APInt(64, BitWidth - 1); - } - - // Get a constant vector of the same type as the first operand. - auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); - auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); - - if (ShiftLeft) - return Builder.CreateShl(Vec, ShiftVec); - - if (LogicalShift) - return Builder.CreateLShr(Vec, ShiftVec); - - return Builder.CreateAShr(Vec, ShiftVec); -} - -// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. -// Unlike the generic IR shifts, the intrinsics have defined behaviour for out -// of range shift amounts (logical - set to zero, arithmetic - splat sign bit). -static Value *simplifyX86varShift(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - bool LogicalShift = false; - bool ShiftLeft = false; - - switch (II.getIntrinsicID()) { - default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - case Intrinsic::x86_avx512_psrav_q_128: - case Intrinsic::x86_avx512_psrav_q_256: - case Intrinsic::x86_avx512_psrav_d_512: - case Intrinsic::x86_avx512_psrav_q_512: - case Intrinsic::x86_avx512_psrav_w_128: - case Intrinsic::x86_avx512_psrav_w_256: - case Intrinsic::x86_avx512_psrav_w_512: - LogicalShift = false; - ShiftLeft = false; - break; - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_q_256: - case Intrinsic::x86_avx512_psrlv_d_512: - case Intrinsic::x86_avx512_psrlv_q_512: - case Intrinsic::x86_avx512_psrlv_w_128: - case Intrinsic::x86_avx512_psrlv_w_256: - case Intrinsic::x86_avx512_psrlv_w_512: - LogicalShift = true; - ShiftLeft = false; - break; - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_q_256: - case Intrinsic::x86_avx512_psllv_d_512: - case Intrinsic::x86_avx512_psllv_q_512: - case Intrinsic::x86_avx512_psllv_w_128: - case Intrinsic::x86_avx512_psllv_w_256: - case Intrinsic::x86_avx512_psllv_w_512: - LogicalShift = true; - ShiftLeft = true; - break; - } - assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); - - auto Vec = II.getArgOperand(0); - auto Amt = II.getArgOperand(1); - auto VT = cast(II.getType()); - auto SVT = VT->getElementType(); - int NumElts = VT->getNumElements(); - int BitWidth = SVT->getIntegerBitWidth(); - - // If the shift amount is guaranteed to be in-range we can replace it with a - // generic shift. - APInt UpperBits = - APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth)); - if (llvm::MaskedValueIsZero(Amt, UpperBits, - II.getModule()->getDataLayout())) { - return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) - : Builder.CreateLShr(Vec, Amt)) - : Builder.CreateAShr(Vec, Amt)); - } - - // Simplify if all shift amounts are constant/undef. - auto *CShift = dyn_cast(Amt); - if (!CShift) - return nullptr; - - // Collect each element's shift amount. - // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. - bool AnyOutOfRange = false; - SmallVector ShiftAmts; - for (int I = 0; I < NumElts; ++I) { - auto *CElt = CShift->getAggregateElement(I); - if (CElt && isa(CElt)) { - ShiftAmts.push_back(-1); - continue; - } - - auto *COp = dyn_cast_or_null(CElt); - if (!COp) - return nullptr; - - // Handle out of range shifts. 
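
(Aside, not part of this patch: the two shift helpers in this hunk assume the x86 rule that an out-of-range shift amount turns a logical shift into zero and acts like a shift by BitWidth - 1 for an arithmetic shift. A minimal scalar sketch of that rule in plain C++; the function names are illustrative only, and the arithmetic case relies on signed right shift being arithmetic, which C++20 guarantees.)

#include <cstdint>

// Reference semantics assumed by the folds above, shown for 32-bit elements.
uint32_t x86Psrl32(uint32_t V, uint64_t Amt) {
  return Amt >= 32 ? 0u : V >> Amt;     // logical right: out of range -> 0
}
uint32_t x86Psll32(uint32_t V, uint64_t Amt) {
  return Amt >= 32 ? 0u : V << Amt;     // logical left: out of range -> 0
}
int32_t x86Psra32(int32_t V, uint64_t Amt) {
  return V >> (Amt >= 32 ? 31 : Amt);   // arithmetic: clamp to 31 (sign splat)
}
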
- // If LogicalShift - set to BitWidth (special case). - // If ArithmeticShift - set to (BitWidth - 1) (sign splat). - APInt ShiftVal = COp->getValue(); - if (ShiftVal.uge(BitWidth)) { - AnyOutOfRange = LogicalShift; - ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); - continue; - } - - ShiftAmts.push_back((int)ShiftVal.getZExtValue()); - } - - // If all elements out of range or UNDEF, return vector of zeros/undefs. - // ArithmeticShift should only hit this if they are all UNDEF. - auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; - if (llvm::all_of(ShiftAmts, OutOfRange)) { - SmallVector ConstantVec; - for (int Idx : ShiftAmts) { - if (Idx < 0) { - ConstantVec.push_back(UndefValue::get(SVT)); - } else { - assert(LogicalShift && "Logical shift expected"); - ConstantVec.push_back(ConstantInt::getNullValue(SVT)); - } - } - return ConstantVector::get(ConstantVec); - } - - // We can't handle only some out of range values with generic logical shifts. - if (AnyOutOfRange) - return nullptr; - - // Build the shift amount constant vector. - SmallVector ShiftVecAmts; - for (int Idx : ShiftAmts) { - if (Idx < 0) - ShiftVecAmts.push_back(UndefValue::get(SVT)); - else - ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); - } - auto ShiftVec = ConstantVector::get(ShiftVecAmts); - - if (ShiftLeft) - return Builder.CreateShl(Vec, ShiftVec); - - if (LogicalShift) - return Builder.CreateLShr(Vec, ShiftVec); - - return Builder.CreateAShr(Vec, ShiftVec); -} - -static Value *simplifyX86pack(IntrinsicInst &II, - InstCombiner::BuilderTy &Builder, bool IsSigned) { - Value *Arg0 = II.getArgOperand(0); - Value *Arg1 = II.getArgOperand(1); - Type *ResTy = II.getType(); - - // Fast all undef handling. - if (isa(Arg0) && isa(Arg1)) - return UndefValue::get(ResTy); - - auto *ArgTy = cast(Arg0->getType()); - unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; - unsigned NumSrcElts = ArgTy->getNumElements(); - assert(cast(ResTy)->getNumElements() == (2 * NumSrcElts) && - "Unexpected packing types"); - - unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; - unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); - unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); - assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && - "Unexpected packing types"); - - // Constant folding. - if (!isa(Arg0) || !isa(Arg1)) - return nullptr; - - // Clamp Values - signed/unsigned both use signed clamp values, but they - // differ on the min/max values. - APInt MinValue, MaxValue; - if (IsSigned) { - // PACKSS: Truncate signed value with signed saturation. - // Source values less than dst minint are saturated to minint. - // Source values greater than dst maxint are saturated to maxint. - MinValue = - APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); - MaxValue = - APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); - } else { - // PACKUS: Truncate signed value with unsigned saturation. - // Source values less than zero are saturated to zero. - // Source values greater than dst maxuint are saturated to maxuint. 
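
(Aside, not part of this patch: the clamp constants being set up here implement ordinary signed/unsigned saturation before truncation. A scalar C++ model of the 16-bit to 8-bit case, with hypothetical function names, for comparison.)

#include <algorithm>
#include <cstdint>

// PACKSSWB element: signed 16-bit -> signed 8-bit with signed saturation.
int8_t packss16to8(int16_t X) {
  return static_cast<int8_t>(std::min<int16_t>(std::max<int16_t>(X, -128), 127));
}

// PACKUSWB element: signed 16-bit -> unsigned 8-bit with unsigned saturation.
uint8_t packus16to8(int16_t X) {
  return static_cast<uint8_t>(std::min<int16_t>(std::max<int16_t>(X, 0), 255));
}
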
- MinValue = APInt::getNullValue(SrcScalarSizeInBits); - MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); - } - - auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); - auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); - Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); - Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); - Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); - Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); - - // Shuffle clamped args together at the lane level. - SmallVector PackMask; - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) - PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); - for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) - PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); - } - auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); - - // Truncate to dst size. - return Builder.CreateTrunc(Shuffle, ResTy); -} - -static Value *simplifyX86movmsk(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Value *Arg = II.getArgOperand(0); - Type *ResTy = II.getType(); - - // movmsk(undef) -> zero as we must ensure the upper bits are zero. - if (isa(Arg)) - return Constant::getNullValue(ResTy); - - auto *ArgTy = dyn_cast(Arg->getType()); - // We can't easily peek through x86_mmx types. - if (!ArgTy) - return nullptr; - - // Expand MOVMSK to compare/bitcast/zext: - // e.g. PMOVMSKB(v16i8 x): - // %cmp = icmp slt <16 x i8> %x, zeroinitializer - // %int = bitcast <16 x i1> %cmp to i16 - // %res = zext i16 %int to i32 - unsigned NumElts = ArgTy->getNumElements(); - Type *IntegerVecTy = VectorType::getInteger(ArgTy); - Type *IntegerTy = Builder.getIntNTy(NumElts); - - Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy); - Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy)); - Res = Builder.CreateBitCast(Res, IntegerTy); - Res = Builder.CreateZExtOrTrunc(Res, ResTy); - return Res; -} - -static Value *simplifyX86addcarry(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Value *CarryIn = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - Value *Op2 = II.getArgOperand(2); - Type *RetTy = II.getType(); - Type *OpTy = Op1->getType(); - assert(RetTy->getStructElementType(0)->isIntegerTy(8) && - RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && - "Unexpected types for x86 addcarry"); - - // If carry-in is zero, this is just an unsigned add with overflow. - if (match(CarryIn, m_ZeroInt())) { - Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, - { Op1, Op2 }); - // The types have to be adjusted to match the x86 call types. 
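
(Aside, not part of this patch: the fold above treats addcarry with a known-zero carry-in as a plain unsigned add plus an overflow flag, which is exactly what uadd.with.overflow computes. A scalar C++ equivalent with hypothetical names.)

#include <cstdint>
#include <utility>

// addcarry with carry-in 0: returns {carry-out, A + B}. The carry-out is just
// the unsigned-overflow bit of the addition.
std::pair<uint8_t, uint64_t> addcarryZeroIn(uint64_t A, uint64_t B) {
  uint64_t Sum = A + B;                 // wraps modulo 2^64
  uint8_t CarryOut = Sum < A ? 1 : 0;   // overflow iff the sum wrapped
  return {CarryOut, Sum};
}
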
- Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); - Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), - Builder.getInt8Ty()); - Value *Res = UndefValue::get(RetTy); - Res = Builder.CreateInsertValue(Res, UAddOV, 0); - return Builder.CreateInsertValue(Res, UAddResult, 1); - } - - return nullptr; -} - -static Value *simplifyX86insertps(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - auto *CInt = dyn_cast(II.getArgOperand(2)); - if (!CInt) - return nullptr; - - VectorType *VecTy = cast(II.getType()); - assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); - - // The immediate permute control byte looks like this: - // [3:0] - zero mask for each 32-bit lane - // [5:4] - select one 32-bit destination lane - // [7:6] - select one 32-bit source lane - - uint8_t Imm = CInt->getZExtValue(); - uint8_t ZMask = Imm & 0xf; - uint8_t DestLane = (Imm >> 4) & 0x3; - uint8_t SourceLane = (Imm >> 6) & 0x3; - - ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); - - // If all zero mask bits are set, this was just a weird way to - // generate a zero vector. - if (ZMask == 0xf) - return ZeroVector; - - // Initialize by passing all of the first source bits through. - int ShuffleMask[4] = {0, 1, 2, 3}; - - // We may replace the second operand with the zero vector. - Value *V1 = II.getArgOperand(1); - - if (ZMask) { - // If the zero mask is being used with a single input or the zero mask - // overrides the destination lane, this is a shuffle with the zero vector. - if ((II.getArgOperand(0) == II.getArgOperand(1)) || - (ZMask & (1 << DestLane))) { - V1 = ZeroVector; - // We may still move 32-bits of the first source vector from one lane - // to another. - ShuffleMask[DestLane] = SourceLane; - // The zero mask may override the previous insert operation. - for (unsigned i = 0; i < 4; ++i) - if ((ZMask >> i) & 0x1) - ShuffleMask[i] = i + 4; - } else { - // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? - return nullptr; - } - } else { - // Replace the selected destination lane with the selected source lane. - ShuffleMask[DestLane] = SourceLane + 4; - } - - return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); -} - -/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding -/// or conversion to a shuffle vector. -static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, - ConstantInt *CILength, ConstantInt *CIIndex, - InstCombiner::BuilderTy &Builder) { - auto LowConstantHighUndef = [&](uint64_t Val) { - Type *IntTy64 = Type::getInt64Ty(II.getContext()); - Constant *Args[] = {ConstantInt::get(IntTy64, Val), - UndefValue::get(IntTy64)}; - return ConstantVector::get(Args); - }; - - // See if we're dealing with constant values. - Constant *C0 = dyn_cast(Op0); - ConstantInt *CI0 = - C0 ? dyn_cast_or_null(C0->getAggregateElement((unsigned)0)) - : nullptr; - - // Attempt to constant fold. - if (CILength && CIIndex) { - // From AMD documentation: "The bit index and field length are each six - // bits in length other bits of the field are ignored." - APInt APIndex = CIIndex->getValue().zextOrTrunc(6); - APInt APLength = CILength->getValue().zextOrTrunc(6); - - unsigned Index = APIndex.getZExtValue(); - - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = APLength == 0 ? 
64 : APLength.getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - unsigned End = Index + Length; - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if (End > 64) - return UndefValue::get(II.getType()); - - // If we are inserting whole bytes, we can convert this to a shuffle. - // Lowering can recognize EXTRQI shuffle masks. - if ((Length % 8) == 0 && (Index % 8) == 0) { - // Convert bit indices to byte indices. - Length /= 8; - Index /= 8; - - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - auto *ShufTy = FixedVectorType::get(IntTy8, 16); - - SmallVector ShuffleMask; - for (int i = 0; i != (int)Length; ++i) - ShuffleMask.push_back(i + Index); - for (int i = Length; i != 8; ++i) - ShuffleMask.push_back(i + 16); - for (int i = 8; i != 16; ++i) - ShuffleMask.push_back(-1); - - Value *SV = Builder.CreateShuffleVector( - Builder.CreateBitCast(Op0, ShufTy), - ConstantAggregateZero::get(ShufTy), ShuffleMask); - return Builder.CreateBitCast(SV, II.getType()); - } - - // Constant Fold - shift Index'th bit to lowest position and mask off - // Length bits. - if (CI0) { - APInt Elt = CI0->getValue(); - Elt.lshrInPlace(Index); - Elt = Elt.zextOrTrunc(Length); - return LowConstantHighUndef(Elt.getZExtValue()); - } - - // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. - if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { - Value *Args[] = {Op0, CILength, CIIndex}; - Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); - return Builder.CreateCall(F, Args); - } - } - - // Constant Fold - extraction from zero is always {zero, undef}. - if (CI0 && CI0->isZero()) - return LowConstantHighUndef(0); - - return nullptr; -} - -/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant -/// folding or conversion to a shuffle vector. -static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, - APInt APLength, APInt APIndex, - InstCombiner::BuilderTy &Builder) { - // From AMD documentation: "The bit index and field length are each six bits - // in length other bits of the field are ignored." - APIndex = APIndex.zextOrTrunc(6); - APLength = APLength.zextOrTrunc(6); - - // Attempt to constant fold. - unsigned Index = APIndex.getZExtValue(); - - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - unsigned End = Index + Length; - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if (End > 64) - return UndefValue::get(II.getType()); - - // If we are inserting whole bytes, we can convert this to a shuffle. - // Lowering can recognize INSERTQI shuffle masks. - if ((Length % 8) == 0 && (Index % 8) == 0) { - // Convert bit indices to byte indices. 
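
(Aside, not part of this patch: the EXTRQ/INSERTQ folds in this hunk follow the field rules quoted from the AMD manual: only six bits of index and length are used, a length of zero means 64, and index + length > 64 is undefined. A scalar C++ reference of the extract case, names illustrative.)

#include <cstdint>
#include <optional>

// Returns the extracted field, or nullopt for the undefined out-of-range case.
std::optional<uint64_t> sse4aExtrq(uint64_t Src, unsigned Index, unsigned Length) {
  Index &= 0x3f;                        // only six bits of each field are used
  Length &= 0x3f;
  if (Length == 0)
    Length = 64;                        // length 0 encodes a full 64-bit field
  if (Index + Length > 64)
    return std::nullopt;                // result is undefined per the manual
  uint64_t Field = Src >> Index;
  if (Length < 64)
    Field &= (1ULL << Length) - 1;      // keep only Length bits
  return Field;
}
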
- Length /= 8; - Index /= 8; - - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - auto *ShufTy = FixedVectorType::get(IntTy8, 16); - - SmallVector ShuffleMask; - for (int i = 0; i != (int)Index; ++i) - ShuffleMask.push_back(i); - for (int i = 0; i != (int)Length; ++i) - ShuffleMask.push_back(i + 16); - for (int i = Index + Length; i != 8; ++i) - ShuffleMask.push_back(i); - for (int i = 8; i != 16; ++i) - ShuffleMask.push_back(-1); - - Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), - Builder.CreateBitCast(Op1, ShufTy), - ShuffleMask); - return Builder.CreateBitCast(SV, II.getType()); - } - - // See if we're dealing with constant values. - Constant *C0 = dyn_cast(Op0); - Constant *C1 = dyn_cast(Op1); - ConstantInt *CI00 = - C0 ? dyn_cast_or_null(C0->getAggregateElement((unsigned)0)) - : nullptr; - ConstantInt *CI10 = - C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)0)) - : nullptr; - - // Constant Fold - insert bottom Length bits starting at the Index'th bit. - if (CI00 && CI10) { - APInt V00 = CI00->getValue(); - APInt V10 = CI10->getValue(); - APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); - V00 = V00 & ~Mask; - V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); - APInt Val = V00 | V10; - Type *IntTy64 = Type::getInt64Ty(II.getContext()); - Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), - UndefValue::get(IntTy64)}; - return ConstantVector::get(Args); - } - - // If we were an INSERTQ call, we'll save demanded elements if we convert to - // INSERTQI. - if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { - Type *IntTy8 = Type::getInt8Ty(II.getContext()); - Constant *CILength = ConstantInt::get(IntTy8, Length, false); - Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); - - Value *Args[] = {Op0, Op1, CILength, CIIndex}; - Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); - return Builder.CreateCall(F, Args); - } - - return nullptr; -} - -/// Attempt to convert pshufb* to shufflevector if the mask is constant. -static Value *simplifyX86pshufb(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Constant *V = dyn_cast(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast(II.getType()); - unsigned NumElts = VecTy->getNumElements(); - assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && - "Unexpected number of elements in shuffle mask!"); - - // Construct a shuffle mask from constant integers or UNDEFs. - int Indexes[64]; - - // Each byte in the shuffle control mask forms an index to permute the - // corresponding byte in the destination operand. - for (unsigned I = 0; I < NumElts; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa(COp) && !isa(COp))) - return nullptr; - - if (isa(COp)) { - Indexes[I] = -1; - continue; - } - - int8_t Index = cast(COp)->getValue().getZExtValue(); - - // If the most significant bit (bit[7]) of each byte of the shuffle - // control mask is set, then zero is written in the result byte. - // The zero vector is in the right-hand side of the resulting - // shufflevector. - - // The value of each index for the high 128-bit lane is the least - // significant 4 bits of the respective shuffle control byte. - Index = ((Index < 0) ? 
NumElts : Index & 0x0F) + (I & 0xF0); - Indexes[I] = Index; - } - - auto V1 = II.getArgOperand(0); - auto V2 = Constant::getNullValue(VecTy); - return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); -} - -/// Attempt to convert vpermilvar* to shufflevector if the mask is constant. -static Value *simplifyX86vpermilvar(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - Constant *V = dyn_cast(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast(II.getType()); - unsigned NumElts = VecTy->getNumElements(); - bool IsPD = VecTy->getScalarType()->isDoubleTy(); - unsigned NumLaneElts = IsPD ? 2 : 4; - assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); - - // Construct a shuffle mask from constant integers or UNDEFs. - int Indexes[16]; - - // The intrinsics only read one or two bits, clear the rest. - for (unsigned I = 0; I < NumElts; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa(COp) && !isa(COp))) - return nullptr; - - if (isa(COp)) { - Indexes[I] = -1; - continue; - } - - APInt Index = cast(COp)->getValue(); - Index = Index.zextOrTrunc(32).getLoBits(2); - - // The PD variants uses bit 1 to select per-lane element index, so - // shift down to convert to generic shuffle mask index. - if (IsPD) - Index.lshrInPlace(1); - - // The _256 variants are a bit trickier since the mask bits always index - // into the corresponding 128 half. In order to convert to a generic - // shuffle, we have to make that explicit. - Index += APInt(32, (I / NumLaneElts) * NumLaneElts); - - Indexes[I] = Index.getZExtValue(); - } - - auto V1 = II.getArgOperand(0); - auto V2 = UndefValue::get(V1->getType()); - return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts)); -} - -/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. -static Value *simplifyX86vpermv(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { - auto *V = dyn_cast(II.getArgOperand(1)); - if (!V) - return nullptr; - - auto *VecTy = cast(II.getType()); - unsigned Size = VecTy->getNumElements(); - assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && - "Unexpected shuffle mask size"); - - // Construct a shuffle mask from constant integers or UNDEFs. 
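
(Aside, not part of this patch: simplifyX86vpermv relies on VPERMD/VPERMPS reading only the low bits of each index, i.e. indices are taken modulo the element count. A scalar C++ model for the 256-bit form, names illustrative.)

#include <array>
#include <cstdint>

// VPERMD: each result element selects a source element by index modulo the
// vector width (8 elements in the 256-bit form).
std::array<uint32_t, 8> vpermd(const std::array<uint32_t, 8> &Src,
                               const std::array<uint32_t, 8> &Idx) {
  std::array<uint32_t, 8> Result{};
  for (unsigned I = 0; I != 8; ++I)
    Result[I] = Src[Idx[I] & 7];        // only the low 3 bits of the index matter
  return Result;
}
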
- int Indexes[64]; - - for (unsigned I = 0; I < Size; ++I) { - Constant *COp = V->getAggregateElement(I); - if (!COp || (!isa(COp) && !isa(COp))) - return nullptr; - - if (isa(COp)) { - Indexes[I] = -1; - continue; - } - - uint32_t Index = cast(COp)->getZExtValue(); - Index &= Size - 1; - Indexes[I] = Index; - } - - auto V1 = II.getArgOperand(0); - auto V2 = UndefValue::get(VecTy); - return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, Size)); -} - // TODO, Obvious Missing Transforms: // * Narrow width by halfs excluding zero/undef lanes -Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) { +Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) { Value *LoadPtr = II.getArgOperand(0); const Align Alignment = cast(II.getArgOperand(1))->getAlignValue(); @@ -1132,7 +303,7 @@ // TODO, Obvious Missing Transforms: // * Single constant active lane -> store // * Narrow width by halfs excluding zero/undef lanes -Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) { +Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) { auto *ConstMask = dyn_cast(II.getArgOperand(3)); if (!ConstMask) return nullptr; @@ -1165,7 +336,7 @@ // * Narrow width by halfs excluding zero/undef lanes // * Vector splat address w/known mask -> scalar load // * Vector incrementing address -> vector masked load -Instruction *InstCombiner::simplifyMaskedGather(IntrinsicInst &II) { +Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) { return nullptr; } @@ -1175,7 +346,7 @@ // * Narrow store width by halfs excluding zero/undef lanes // * Vector splat address w/known mask -> scalar store // * Vector incrementing address -> vector masked store -Instruction *InstCombiner::simplifyMaskedScatter(IntrinsicInst &II) { +Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { auto *ConstMask = dyn_cast(II.getArgOperand(3)); if (!ConstMask) return nullptr; @@ -1206,7 +377,7 @@ /// This is legal because it preserves the most recent information about /// the presence or absence of invariant.group. static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II, - InstCombiner &IC) { + InstCombinerImpl &IC) { auto *Arg = II.getArgOperand(0); auto *StrippedArg = Arg->stripPointerCasts(); auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups(); @@ -1231,7 +402,7 @@ return cast(Result); } -static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { +static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { assert((II.getIntrinsicID() == Intrinsic::cttz || II.getIntrinsicID() == Intrinsic::ctlz) && "Expected cttz or ctlz intrinsic"); @@ -1301,7 +472,7 @@ return nullptr; } -static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) { +static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) { assert(II.getIntrinsicID() == Intrinsic::ctpop && "Expected ctpop intrinsic"); Type *Ty = II.getType(); @@ -1356,107 +527,6 @@ return nullptr; } -// TODO: If the x86 backend knew how to convert a bool vector mask back to an -// XMM register mask efficiently, we could transform all x86 masked intrinsics -// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. -static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { - Value *Ptr = II.getOperand(0); - Value *Mask = II.getOperand(1); - Constant *ZeroVec = Constant::getNullValue(II.getType()); - - // Special case a zero mask since that's not a ConstantDataVector. 
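
(Aside, not part of this patch: the getNegativeIsTrueBoolVec conversion used by the masked load/store folds below encodes the x86 rule that a lane is active iff the sign bit of its mask element is set. A one-lane scalar C++ sketch, names illustrative.)

#include <cstdint>

// One lane of an x86 masked load: active iff the mask element is negative
// (sign bit set); inactive lanes produce zero instead of reading memory.
uint32_t maskedLoadLane(const uint32_t *Ptr, int32_t MaskElt) {
  return MaskElt < 0 ? *Ptr : 0u;
}
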
- // This masked load instruction creates a zero vector. - if (isa(Mask)) - return IC.replaceInstUsesWith(II, ZeroVec); - - auto *ConstMask = dyn_cast(Mask); - if (!ConstMask) - return nullptr; - - // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic - // to allow target-independent optimizations. - - // First, cast the x86 intrinsic scalar pointer to a vector pointer to match - // the LLVM intrinsic definition for the pointer argument. - unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); - PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); - Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); - - // Second, convert the x86 XMM integer vector mask to a vector of bools based - // on each element's most significant bit (the sign bit). - Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); - - // The pass-through vector for an x86 masked load is a zero vector. - CallInst *NewMaskedLoad = - IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec); - return IC.replaceInstUsesWith(II, NewMaskedLoad); -} - -// TODO: If the x86 backend knew how to convert a bool vector mask back to an -// XMM register mask efficiently, we could transform all x86 masked intrinsics -// to LLVM masked intrinsics and remove the x86 masked intrinsic defs. -static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { - Value *Ptr = II.getOperand(0); - Value *Mask = II.getOperand(1); - Value *Vec = II.getOperand(2); - - // Special case a zero mask since that's not a ConstantDataVector: - // this masked store instruction does nothing. - if (isa(Mask)) { - IC.eraseInstFromFunction(II); - return true; - } - - // The SSE2 version is too weird (eg, unaligned but non-temporal) to do - // anything else at this level. - if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) - return false; - - auto *ConstMask = dyn_cast(Mask); - if (!ConstMask) - return false; - - // The mask is constant. Convert this x86 intrinsic to the LLVM instrinsic - // to allow target-independent optimizations. - - // First, cast the x86 intrinsic scalar pointer to a vector pointer to match - // the LLVM intrinsic definition for the pointer argument. - unsigned AddrSpace = cast(Ptr->getType())->getAddressSpace(); - PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); - Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); - - // Second, convert the x86 XMM integer vector mask to a vector of bools based - // on each element's most significant bit (the sign bit). - Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask); - - IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); - - // 'Replace uses' doesn't work for stores. Erase the original masked store. - IC.eraseInstFromFunction(II); - return true; -} - -// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs. -// -// A single NaN input is folded to minnum, so we rely on that folding for -// handling NaNs. 
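
(Aside, not part of this patch: for NaN-free inputs the fmed3 helper below returns the median of its three operands. Its max-of-three formulation is equivalent to the usual min/max expression shown here in plain C++, name illustrative.)

#include <algorithm>

// Median of three finite floats: max(min(a, b), min(max(a, b), c)).
float med3(float A, float B, float C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}
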
-static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, - const APFloat &Src2) { - APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2); - - APFloat::cmpResult Cmp0 = Max3.compare(Src0); - assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately"); - if (Cmp0 == APFloat::cmpEqual) - return maxnum(Src1, Src2); - - APFloat::cmpResult Cmp1 = Max3.compare(Src1); - assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately"); - if (Cmp1 == APFloat::cmpEqual) - return maxnum(Src0, Src2); - - return maxnum(Src0, Src1); -} - /// Convert a table lookup to shufflevector if the mask is constant. /// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in /// which case we could lower the shufflevector with rev64 instructions @@ -1495,28 +565,6 @@ return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes)); } -/// Convert a vector load intrinsic into a simple llvm load instruction. -/// This is beneficial when the underlying object being addressed comes -/// from a constant, since we get constant-folding for free. -static Value *simplifyNeonVld1(const IntrinsicInst &II, - unsigned MemAlign, - InstCombiner::BuilderTy &Builder) { - auto *IntrAlign = dyn_cast(II.getArgOperand(1)); - - if (!IntrAlign) - return nullptr; - - unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ? - MemAlign : IntrAlign->getLimitedValue(); - - if (!isPowerOf2_32(Alignment)) - return nullptr; - - auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0), - PointerType::get(II.getType(), 0)); - return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment)); -} - // Returns true iff the 2 intrinsics have the same operands, limiting the // comparison to the first NumOperands. static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E, @@ -1538,9 +586,9 @@ // call @llvm.foo.start(i1 0) ; This one won't be skipped: it will be removed // call @llvm.foo.end(i1 0) // call @llvm.foo.end(i1 0) ; &I -static bool removeTriviallyEmptyRange( - IntrinsicInst &EndI, InstCombiner &IC, - std::function IsStart) { +static bool +removeTriviallyEmptyRange(IntrinsicInst &EndI, InstCombinerImpl &IC, + std::function IsStart) { // We start from the end intrinsic and scan backwards, so that InstCombine // has already processed (and potentially removed) all the instructions // before the end intrinsic. @@ -1566,256 +614,7 @@ return false; } -// Convert NVVM intrinsics to target-generic LLVM code where possible. -static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) { - // Each NVVM intrinsic we can simplify can be replaced with one of: - // - // * an LLVM intrinsic, - // * an LLVM cast operation, - // * an LLVM binary operation, or - // * ad-hoc LLVM IR for the particular operation. - - // Some transformations are only valid when the module's - // flush-denormals-to-zero (ftz) setting is true/false, whereas other - // transformations are valid regardless of the module's ftz setting. - enum FtzRequirementTy { - FTZ_Any, // Any ftz setting is ok. - FTZ_MustBeOn, // Transformation is valid only if ftz is on. - FTZ_MustBeOff, // Transformation is valid only if ftz is off. - }; - // Classes of NVVM intrinsics that can't be replaced one-to-one with a - // target-generic intrinsic, cast op, or binary op but that we can nonetheless - // simplify. - enum SpecialCase { - SPC_Reciprocal, - }; - - // SimplifyAction is a poor-man's variant (plus an additional flag) that - // represents how to replace an NVVM intrinsic with target-generic LLVM IR. 
- struct SimplifyAction { - // Invariant: At most one of these Optionals has a value. - Optional IID; - Optional CastOp; - Optional BinaryOp; - Optional Special; - - FtzRequirementTy FtzRequirement = FTZ_Any; - - SimplifyAction() = default; - - SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq) - : IID(IID), FtzRequirement(FtzReq) {} - - // Cast operations don't have anything to do with FTZ, so we skip that - // argument. - SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} - - SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) - : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} - - SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) - : Special(Special), FtzRequirement(FtzReq) {} - }; - - // Try to generate a SimplifyAction describing how to replace our - // IntrinsicInstr with target-generic LLVM IR. - const SimplifyAction Action = [II]() -> SimplifyAction { - switch (II->getIntrinsicID()) { - // NVVM intrinsics that map directly to LLVM intrinsics. - case Intrinsic::nvvm_ceil_d: - return {Intrinsic::ceil, FTZ_Any}; - case Intrinsic::nvvm_ceil_f: - return {Intrinsic::ceil, FTZ_MustBeOff}; - case Intrinsic::nvvm_ceil_ftz_f: - return {Intrinsic::ceil, FTZ_MustBeOn}; - case Intrinsic::nvvm_fabs_d: - return {Intrinsic::fabs, FTZ_Any}; - case Intrinsic::nvvm_fabs_f: - return {Intrinsic::fabs, FTZ_MustBeOff}; - case Intrinsic::nvvm_fabs_ftz_f: - return {Intrinsic::fabs, FTZ_MustBeOn}; - case Intrinsic::nvvm_floor_d: - return {Intrinsic::floor, FTZ_Any}; - case Intrinsic::nvvm_floor_f: - return {Intrinsic::floor, FTZ_MustBeOff}; - case Intrinsic::nvvm_floor_ftz_f: - return {Intrinsic::floor, FTZ_MustBeOn}; - case Intrinsic::nvvm_fma_rn_d: - return {Intrinsic::fma, FTZ_Any}; - case Intrinsic::nvvm_fma_rn_f: - return {Intrinsic::fma, FTZ_MustBeOff}; - case Intrinsic::nvvm_fma_rn_ftz_f: - return {Intrinsic::fma, FTZ_MustBeOn}; - case Intrinsic::nvvm_fmax_d: - return {Intrinsic::maxnum, FTZ_Any}; - case Intrinsic::nvvm_fmax_f: - return {Intrinsic::maxnum, FTZ_MustBeOff}; - case Intrinsic::nvvm_fmax_ftz_f: - return {Intrinsic::maxnum, FTZ_MustBeOn}; - case Intrinsic::nvvm_fmin_d: - return {Intrinsic::minnum, FTZ_Any}; - case Intrinsic::nvvm_fmin_f: - return {Intrinsic::minnum, FTZ_MustBeOff}; - case Intrinsic::nvvm_fmin_ftz_f: - return {Intrinsic::minnum, FTZ_MustBeOn}; - case Intrinsic::nvvm_round_d: - return {Intrinsic::round, FTZ_Any}; - case Intrinsic::nvvm_round_f: - return {Intrinsic::round, FTZ_MustBeOff}; - case Intrinsic::nvvm_round_ftz_f: - return {Intrinsic::round, FTZ_MustBeOn}; - case Intrinsic::nvvm_sqrt_rn_d: - return {Intrinsic::sqrt, FTZ_Any}; - case Intrinsic::nvvm_sqrt_f: - // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the - // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts - // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are - // the versions with explicit ftz-ness. - return {Intrinsic::sqrt, FTZ_Any}; - case Intrinsic::nvvm_sqrt_rn_f: - return {Intrinsic::sqrt, FTZ_MustBeOff}; - case Intrinsic::nvvm_sqrt_rn_ftz_f: - return {Intrinsic::sqrt, FTZ_MustBeOn}; - case Intrinsic::nvvm_trunc_d: - return {Intrinsic::trunc, FTZ_Any}; - case Intrinsic::nvvm_trunc_f: - return {Intrinsic::trunc, FTZ_MustBeOff}; - case Intrinsic::nvvm_trunc_ftz_f: - return {Intrinsic::trunc, FTZ_MustBeOn}; - - // NVVM intrinsics that map to LLVM cast operations. 
- // - // Note that llvm's target-generic conversion operators correspond to the rz - // (round to zero) versions of the nvvm conversion intrinsics, even though - // most everything else here uses the rn (round to nearest even) nvvm ops. - case Intrinsic::nvvm_d2i_rz: - case Intrinsic::nvvm_f2i_rz: - case Intrinsic::nvvm_d2ll_rz: - case Intrinsic::nvvm_f2ll_rz: - return {Instruction::FPToSI}; - case Intrinsic::nvvm_d2ui_rz: - case Intrinsic::nvvm_f2ui_rz: - case Intrinsic::nvvm_d2ull_rz: - case Intrinsic::nvvm_f2ull_rz: - return {Instruction::FPToUI}; - case Intrinsic::nvvm_i2d_rz: - case Intrinsic::nvvm_i2f_rz: - case Intrinsic::nvvm_ll2d_rz: - case Intrinsic::nvvm_ll2f_rz: - return {Instruction::SIToFP}; - case Intrinsic::nvvm_ui2d_rz: - case Intrinsic::nvvm_ui2f_rz: - case Intrinsic::nvvm_ull2d_rz: - case Intrinsic::nvvm_ull2f_rz: - return {Instruction::UIToFP}; - - // NVVM intrinsics that map to LLVM binary ops. - case Intrinsic::nvvm_add_rn_d: - return {Instruction::FAdd, FTZ_Any}; - case Intrinsic::nvvm_add_rn_f: - return {Instruction::FAdd, FTZ_MustBeOff}; - case Intrinsic::nvvm_add_rn_ftz_f: - return {Instruction::FAdd, FTZ_MustBeOn}; - case Intrinsic::nvvm_mul_rn_d: - return {Instruction::FMul, FTZ_Any}; - case Intrinsic::nvvm_mul_rn_f: - return {Instruction::FMul, FTZ_MustBeOff}; - case Intrinsic::nvvm_mul_rn_ftz_f: - return {Instruction::FMul, FTZ_MustBeOn}; - case Intrinsic::nvvm_div_rn_d: - return {Instruction::FDiv, FTZ_Any}; - case Intrinsic::nvvm_div_rn_f: - return {Instruction::FDiv, FTZ_MustBeOff}; - case Intrinsic::nvvm_div_rn_ftz_f: - return {Instruction::FDiv, FTZ_MustBeOn}; - - // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but - // need special handling. - // - // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just - // as well. - case Intrinsic::nvvm_rcp_rn_d: - return {SPC_Reciprocal, FTZ_Any}; - case Intrinsic::nvvm_rcp_rn_f: - return {SPC_Reciprocal, FTZ_MustBeOff}; - case Intrinsic::nvvm_rcp_rn_ftz_f: - return {SPC_Reciprocal, FTZ_MustBeOn}; - - // We do not currently simplify intrinsics that give an approximate answer. - // These include: - // - // - nvvm_cos_approx_{f,ftz_f} - // - nvvm_ex2_approx_{d,f,ftz_f} - // - nvvm_lg2_approx_{d,f,ftz_f} - // - nvvm_sin_approx_{f,ftz_f} - // - nvvm_sqrt_approx_{f,ftz_f} - // - nvvm_rsqrt_approx_{d,f,ftz_f} - // - nvvm_div_approx_{ftz_d,ftz_f,f} - // - nvvm_rcp_approx_ftz_d - // - // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" - // means that fastmath is enabled in the intrinsic. Unfortunately only - // binary operators (currently) have a fastmath bit in SelectionDAG, so this - // information gets lost and we can't select on it. - // - // TODO: div and rcp are lowered to a binary op, so these we could in theory - // lower them to "fast fdiv". - - default: - return {}; - } - }(); - - // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we - // can bail out now. (Notice that in the case that IID is not an NVVM - // intrinsic, we don't have to look up any module metadata, as - // FtzRequirementTy will be FTZ_Any.) - if (Action.FtzRequirement != FTZ_Any) { - StringRef Attr = II->getFunction() - ->getFnAttribute("denormal-fp-math-f32") - .getValueAsString(); - DenormalMode Mode = parseDenormalFPAttribute(Attr); - bool FtzEnabled = Mode.Output != DenormalMode::IEEE; - - if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) - return nullptr; - } - - // Simplify to target-generic intrinsic. 
- if (Action.IID) { - SmallVector Args(II->arg_operands()); - // All the target-generic intrinsics currently of interest to us have one - // type argument, equal to that of the nvvm intrinsic's argument. - Type *Tys[] = {II->getArgOperand(0)->getType()}; - return CallInst::Create( - Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); - } - - // Simplify to target-generic binary op. - if (Action.BinaryOp) - return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0), - II->getArgOperand(1), II->getName()); - - // Simplify to target-generic cast op. - if (Action.CastOp) - return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(), - II->getName()); - - // All that's left are the special cases. - if (!Action.Special) - return nullptr; - - switch (*Action.Special) { - case SPC_Reciprocal: - // Simplify reciprocal. - return BinaryOperator::Create( - Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1), - II->getArgOperand(0), II->getName()); - } - llvm_unreachable("All SpecialCase enumerators should be handled in switch."); -} - -Instruction *InstCombiner::visitVAEndInst(VAEndInst &I) { +Instruction *InstCombinerImpl::visitVAEndInst(VAEndInst &I) { removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) { return I.getIntrinsicID() == Intrinsic::vastart || I.getIntrinsicID() == Intrinsic::vacopy; @@ -1834,20 +633,31 @@ return nullptr; } -Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { +/// Creates a result tuple for an overflow intrinsic \p II with a given +/// \p Result and a constant \p Overflow value. +static Instruction *createOverflowTuple(IntrinsicInst *II, Value *Result, + Constant *Overflow) { + Constant *V[] = {UndefValue::get(Result->getType()), Overflow}; + StructType *ST = cast(II->getType()); + Constant *Struct = ConstantStruct::get(ST, V); + return InsertValueInst::Create(Struct, Result, 0); +} + +Instruction * +InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) { WithOverflowInst *WO = cast(II); Value *OperationResult = nullptr; Constant *OverflowResult = nullptr; if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(), WO->getRHS(), *WO, OperationResult, OverflowResult)) - return CreateOverflowTuple(WO, OperationResult, OverflowResult); + return createOverflowTuple(WO, OperationResult, OverflowResult); return nullptr; } /// CallInst simplification. This mostly only handles folding of intrinsic /// instructions. For normal calls, it allows visitCallBase to do the heavy /// lifting. -Instruction *InstCombiner::visitCallInst(CallInst &CI) { +Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Don't try to simplify calls without uses. It will not do anything useful, // but will result in the following folds being skipped. 
-  if (Action.IID) {
-    SmallVector<Value *, 4> Args(II->arg_operands());
-    // All the target-generic intrinsics currently of interest to us have one
-    // type argument, equal to that of the nvvm intrinsic's argument.
-    Type *Tys[] = {II->getArgOperand(0)->getType()};
-    return CallInst::Create(
-        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
-  }
-
-  // Simplify to target-generic binary op.
-  if (Action.BinaryOp)
-    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
-                                  II->getArgOperand(1), II->getName());
-
-  // Simplify to target-generic cast op.
-  if (Action.CastOp)
-    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
-                            II->getName());
-
-  // All that's left are the special cases.
-  if (!Action.Special)
-    return nullptr;
-
-  switch (*Action.Special) {
-  case SPC_Reciprocal:
-    // Simplify reciprocal.
-    return BinaryOperator::Create(
-        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
-        II->getArgOperand(0), II->getName());
-  }
-  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
-}
-
-Instruction *InstCombiner::visitVAEndInst(VAEndInst &I) {
+Instruction *InstCombinerImpl::visitVAEndInst(VAEndInst &I) {
   removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) {
     return I.getIntrinsicID() == Intrinsic::vastart ||
            I.getIntrinsicID() == Intrinsic::vacopy;
@@ -1834,20 +633,31 @@
   return nullptr;
 }
 
-Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
+/// Creates a result tuple for an overflow intrinsic \p II with a given
+/// \p Result and a constant \p Overflow value.
+static Instruction *createOverflowTuple(IntrinsicInst *II, Value *Result,
+                                        Constant *Overflow) {
+  Constant *V[] = {UndefValue::get(Result->getType()), Overflow};
+  StructType *ST = cast<StructType>(II->getType());
+  Constant *Struct = ConstantStruct::get(ST, V);
+  return InsertValueInst::Create(Struct, Result, 0);
+}
+
+Instruction *
+InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
   WithOverflowInst *WO = cast<WithOverflowInst>(II);
   Value *OperationResult = nullptr;
   Constant *OverflowResult = nullptr;
   if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
                             WO->getRHS(), *WO, OperationResult, OverflowResult))
-    return CreateOverflowTuple(WO, OperationResult, OverflowResult);
+    return createOverflowTuple(WO, OperationResult, OverflowResult);
   return nullptr;
 }
 
 /// CallInst simplification. This mostly only handles folding of intrinsic
 /// instructions. For normal calls, it allows visitCallBase to do the heavy
 /// lifting.
-Instruction *InstCombiner::visitCallInst(CallInst &CI) {
+Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
   // Don't try to simplify calls without uses. It will not do anything useful,
   // but will result in the following folds being skipped.
if (!CI.use_empty()) @@ -1953,19 +763,8 @@ } } - if (Instruction *I = SimplifyNVVMIntrinsic(II, *this)) - return I; - - auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, - unsigned DemandedWidth) { - APInt UndefElts(Width, 0); - APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); - return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); - }; - Intrinsic::ID IID = II->getIntrinsicID(); switch (IID) { - default: break; case Intrinsic::objectsize: if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) return replaceInstUsesWith(CI, V); @@ -2465,932 +1264,6 @@ } break; } - case Intrinsic::ppc_altivec_lvx: - case Intrinsic::ppc_altivec_lvxl: - // Turn PPC lvx -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(16), DL, II, &AC, - &DT) >= 16) { - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), - PointerType::getUnqual(II->getType())); - return new LoadInst(II->getType(), Ptr, "", false, Align(16)); - } - break; - case Intrinsic::ppc_vsx_lxvw4x: - case Intrinsic::ppc_vsx_lxvd2x: { - // Turn PPC VSX loads into normal loads. - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), - PointerType::getUnqual(II->getType())); - return new LoadInst(II->getType(), Ptr, Twine(""), false, Align(1)); - } - case Intrinsic::ppc_altivec_stvx: - case Intrinsic::ppc_altivec_stvxl: - // Turn stvx -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(16), DL, II, &AC, - &DT) >= 16) { - Type *OpPtrTy = - PointerType::getUnqual(II->getArgOperand(0)->getType()); - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); - return new StoreInst(II->getArgOperand(0), Ptr, false, Align(16)); - } - break; - case Intrinsic::ppc_vsx_stxvw4x: - case Intrinsic::ppc_vsx_stxvd2x: { - // Turn PPC VSX stores into normal stores. - Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); - return new StoreInst(II->getArgOperand(0), Ptr, false, Align(1)); - } - case Intrinsic::ppc_qpx_qvlfs: - // Turn PPC QPX qvlfs -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(16), DL, II, &AC, - &DT) >= 16) { - Type *VTy = - VectorType::get(Builder.getFloatTy(), - cast(II->getType())->getElementCount()); - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), - PointerType::getUnqual(VTy)); - Value *Load = Builder.CreateLoad(VTy, Ptr); - return new FPExtInst(Load, II->getType()); - } - break; - case Intrinsic::ppc_qpx_qvlfd: - // Turn PPC QPX qvlfd -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(32), DL, II, &AC, - &DT) >= 32) { - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), - PointerType::getUnqual(II->getType())); - return new LoadInst(II->getType(), Ptr, "", false, Align(32)); - } - break; - case Intrinsic::ppc_qpx_qvstfs: - // Turn PPC QPX qvstfs -> store if the pointer is known aligned. 
- if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(16), DL, II, &AC, - &DT) >= 16) { - Type *VTy = VectorType::get( - Builder.getFloatTy(), - cast(II->getArgOperand(0)->getType())->getElementCount()); - Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy); - Type *OpPtrTy = PointerType::getUnqual(VTy); - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); - return new StoreInst(TOp, Ptr, false, Align(16)); - } - break; - case Intrinsic::ppc_qpx_qvstfd: - // Turn PPC QPX qvstfd -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(32), DL, II, &AC, - &DT) >= 32) { - Type *OpPtrTy = - PointerType::getUnqual(II->getArgOperand(0)->getType()); - Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); - return new StoreInst(II->getArgOperand(0), Ptr, false, Align(32)); - } - break; - - case Intrinsic::x86_bmi_bextr_32: - case Intrinsic::x86_bmi_bextr_64: - case Intrinsic::x86_tbm_bextri_u32: - case Intrinsic::x86_tbm_bextri_u64: - // If the RHS is a constant we can try some simplifications. - if (auto *C = dyn_cast(II->getArgOperand(1))) { - uint64_t Shift = C->getZExtValue(); - uint64_t Length = (Shift >> 8) & 0xff; - Shift &= 0xff; - unsigned BitWidth = II->getType()->getIntegerBitWidth(); - // If the length is 0 or the shift is out of range, replace with zero. - if (Length == 0 || Shift >= BitWidth) - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); - // If the LHS is also a constant, we can completely constant fold this. - if (auto *InC = dyn_cast(II->getArgOperand(0))) { - uint64_t Result = InC->getZExtValue() >> Shift; - if (Length > BitWidth) - Length = BitWidth; - Result &= maskTrailingOnes(Length); - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); - } - // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we - // are only masking bits that a shift already cleared? - } - break; - - case Intrinsic::x86_bmi_bzhi_32: - case Intrinsic::x86_bmi_bzhi_64: - // If the RHS is a constant we can try some simplifications. - if (auto *C = dyn_cast(II->getArgOperand(1))) { - uint64_t Index = C->getZExtValue() & 0xff; - unsigned BitWidth = II->getType()->getIntegerBitWidth(); - if (Index >= BitWidth) - return replaceInstUsesWith(CI, II->getArgOperand(0)); - if (Index == 0) - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); - // If the LHS is also a constant, we can completely constant fold this. - if (auto *InC = dyn_cast(II->getArgOperand(0))) { - uint64_t Result = InC->getZExtValue(); - Result &= maskTrailingOnes(Index); - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); - } - // TODO should we convert this to an AND if the RHS is constant? - } - break; - case Intrinsic::x86_bmi_pext_32: - case Intrinsic::x86_bmi_pext_64: - if (auto *MaskC = dyn_cast(II->getArgOperand(1))) { - if (MaskC->isNullValue()) - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); - if (MaskC->isAllOnesValue()) - return replaceInstUsesWith(CI, II->getArgOperand(0)); - - if (auto *SrcC = dyn_cast(II->getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToSet = 1; - - while (Mask) { - // Isolate lowest set bit. - uint64_t BitToTest = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToSet <<= 1; - // Clear lowest set bit. 
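
(Aside, not part of this patch: the constant-folding loops in this hunk mirror the usual PEXT/PDEP reference semantics. A standalone C++ version, names illustrative, handy for sanity-checking the folds.)

#include <cstdint>

// PEXT: gather the Src bits selected by Mask into the low bits of the result.
uint64_t pext64(uint64_t Src, uint64_t Mask) {
  uint64_t Result = 0, OutBit = 1;
  for (; Mask; Mask &= Mask - 1) {      // iterate over set mask bits, lowest first
    if (Src & (Mask & -Mask))           // test the source bit under this mask bit
      Result |= OutBit;
    OutBit <<= 1;
  }
  return Result;
}

// PDEP: scatter the low bits of Src to the positions selected by Mask.
uint64_t pdep64(uint64_t Src, uint64_t Mask) {
  uint64_t Result = 0, InBit = 1;
  for (; Mask; Mask &= Mask - 1) {      // iterate over set mask bits, lowest first
    if (Src & InBit)                    // take the next low bit of the source
      Result |= Mask & -Mask;           // ...and place it at this mask position
    InBit <<= 1;
  }
  return Result;
}
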
- Mask &= Mask - 1; - } - - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); - } - } - break; - case Intrinsic::x86_bmi_pdep_32: - case Intrinsic::x86_bmi_pdep_64: - if (auto *MaskC = dyn_cast(II->getArgOperand(1))) { - if (MaskC->isNullValue()) - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); - if (MaskC->isAllOnesValue()) - return replaceInstUsesWith(CI, II->getArgOperand(0)); - - if (auto *SrcC = dyn_cast(II->getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToTest = 1; - - while (Mask) { - // Isolate lowest set bit. - uint64_t BitToSet = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToTest <<= 1; - // Clear lowest set bit; - Mask &= Mask - 1; - } - - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); - } - } - break; - - case Intrinsic::x86_sse_cvtss2si: - case Intrinsic::x86_sse_cvtss2si64: - case Intrinsic::x86_sse_cvttss2si: - case Intrinsic::x86_sse_cvttss2si64: - case Intrinsic::x86_sse2_cvtsd2si: - case Intrinsic::x86_sse2_cvtsd2si64: - case Intrinsic::x86_sse2_cvttsd2si: - case Intrinsic::x86_sse2_cvttsd2si64: - case Intrinsic::x86_avx512_vcvtss2si32: - case Intrinsic::x86_avx512_vcvtss2si64: - case Intrinsic::x86_avx512_vcvtss2usi32: - case Intrinsic::x86_avx512_vcvtss2usi64: - case Intrinsic::x86_avx512_vcvtsd2si32: - case Intrinsic::x86_avx512_vcvtsd2si64: - case Intrinsic::x86_avx512_vcvtsd2usi32: - case Intrinsic::x86_avx512_vcvtsd2usi64: - case Intrinsic::x86_avx512_cvttss2si: - case Intrinsic::x86_avx512_cvttss2si64: - case Intrinsic::x86_avx512_cvttss2usi: - case Intrinsic::x86_avx512_cvttss2usi64: - case Intrinsic::x86_avx512_cvttsd2si: - case Intrinsic::x86_avx512_cvttsd2si64: - case Intrinsic::x86_avx512_cvttsd2usi: - case Intrinsic::x86_avx512_cvttsd2usi64: { - // These intrinsics only demand the 0th element of their input vectors. If - // we can simplify the input based on that, do so now. 
- Value *Arg = II->getArgOperand(0); - unsigned VWidth = cast(Arg->getType())->getNumElements(); - if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) - return replaceOperand(*II, 0, V); - break; - } - - case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse_movmsk_ps: - case Intrinsic::x86_sse2_movmsk_pd: - case Intrinsic::x86_sse2_pmovmskb_128: - case Intrinsic::x86_avx_movmsk_pd_256: - case Intrinsic::x86_avx_movmsk_ps_256: - case Intrinsic::x86_avx2_pmovmskb: - if (Value *V = simplifyX86movmsk(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_sse_comieq_ss: - case Intrinsic::x86_sse_comige_ss: - case Intrinsic::x86_sse_comigt_ss: - case Intrinsic::x86_sse_comile_ss: - case Intrinsic::x86_sse_comilt_ss: - case Intrinsic::x86_sse_comineq_ss: - case Intrinsic::x86_sse_ucomieq_ss: - case Intrinsic::x86_sse_ucomige_ss: - case Intrinsic::x86_sse_ucomigt_ss: - case Intrinsic::x86_sse_ucomile_ss: - case Intrinsic::x86_sse_ucomilt_ss: - case Intrinsic::x86_sse_ucomineq_ss: - case Intrinsic::x86_sse2_comieq_sd: - case Intrinsic::x86_sse2_comige_sd: - case Intrinsic::x86_sse2_comigt_sd: - case Intrinsic::x86_sse2_comile_sd: - case Intrinsic::x86_sse2_comilt_sd: - case Intrinsic::x86_sse2_comineq_sd: - case Intrinsic::x86_sse2_ucomieq_sd: - case Intrinsic::x86_sse2_ucomige_sd: - case Intrinsic::x86_sse2_ucomigt_sd: - case Intrinsic::x86_sse2_ucomile_sd: - case Intrinsic::x86_sse2_ucomilt_sd: - case Intrinsic::x86_sse2_ucomineq_sd: - case Intrinsic::x86_avx512_vcomi_ss: - case Intrinsic::x86_avx512_vcomi_sd: - case Intrinsic::x86_avx512_mask_cmp_ss: - case Intrinsic::x86_avx512_mask_cmp_sd: { - // These intrinsics only demand the 0th element of their input vectors. If - // we can simplify the input based on that, do so now. - bool MadeChange = false; - Value *Arg0 = II->getArgOperand(0); - Value *Arg1 = II->getArgOperand(1); - unsigned VWidth = cast(Arg0->getType())->getNumElements(); - if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { - replaceOperand(*II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { - replaceOperand(*II, 1, V); - MadeChange = true; - } - if (MadeChange) - return II; - break; - } - case Intrinsic::x86_avx512_cmp_pd_128: - case Intrinsic::x86_avx512_cmp_pd_256: - case Intrinsic::x86_avx512_cmp_pd_512: - case Intrinsic::x86_avx512_cmp_ps_128: - case Intrinsic::x86_avx512_cmp_ps_256: - case Intrinsic::x86_avx512_cmp_ps_512: { - // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a) - Value *Arg0 = II->getArgOperand(0); - Value *Arg1 = II->getArgOperand(1); - bool Arg0IsZero = match(Arg0, m_PosZeroFP()); - if (Arg0IsZero) - std::swap(Arg0, Arg1); - Value *A, *B; - // This fold requires only the NINF(not +/- inf) since inf minus - // inf is nan. - // NSZ(No Signed Zeros) is not needed because zeros of any sign are - // equal for both compares. - // NNAN is not needed because nans compare the same for both compares. - // The compare intrinsic uses the above assumptions and therefore - // doesn't require additional flags. 
- if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) && - match(Arg1, m_PosZeroFP()) && isa(Arg0) && - cast(Arg0)->getFastMathFlags().noInfs())) { - if (Arg0IsZero) - std::swap(A, B); - replaceOperand(*II, 0, A); - replaceOperand(*II, 1, B); - return II; - } - break; - } - - case Intrinsic::x86_avx512_add_ps_512: - case Intrinsic::x86_avx512_div_ps_512: - case Intrinsic::x86_avx512_mul_ps_512: - case Intrinsic::x86_avx512_sub_ps_512: - case Intrinsic::x86_avx512_add_pd_512: - case Intrinsic::x86_avx512_div_pd_512: - case Intrinsic::x86_avx512_mul_pd_512: - case Intrinsic::x86_avx512_sub_pd_512: - // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular - // IR operations. - if (auto *R = dyn_cast(II->getArgOperand(2))) { - if (R->getValue() == 4) { - Value *Arg0 = II->getArgOperand(0); - Value *Arg1 = II->getArgOperand(1); - - Value *V; - switch (IID) { - default: llvm_unreachable("Case stmts out of sync!"); - case Intrinsic::x86_avx512_add_ps_512: - case Intrinsic::x86_avx512_add_pd_512: - V = Builder.CreateFAdd(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_sub_ps_512: - case Intrinsic::x86_avx512_sub_pd_512: - V = Builder.CreateFSub(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_mul_ps_512: - case Intrinsic::x86_avx512_mul_pd_512: - V = Builder.CreateFMul(Arg0, Arg1); - break; - case Intrinsic::x86_avx512_div_ps_512: - case Intrinsic::x86_avx512_div_pd_512: - V = Builder.CreateFDiv(Arg0, Arg1); - break; - } - - return replaceInstUsesWith(*II, V); - } - } - break; - - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular - // IR operations. - if (auto *R = dyn_cast(II->getArgOperand(4))) { - if (R->getValue() == 4) { - // Extract the element as scalars. - Value *Arg0 = II->getArgOperand(0); - Value *Arg1 = II->getArgOperand(1); - Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0); - Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0); - - Value *V; - switch (IID) { - default: llvm_unreachable("Case stmts out of sync!"); - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - V = Builder.CreateFAdd(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - V = Builder.CreateFSub(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - V = Builder.CreateFMul(LHS, RHS); - break; - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - V = Builder.CreateFDiv(LHS, RHS); - break; - } - - // Handle the masking aspect of the intrinsic. - Value *Mask = II->getArgOperand(3); - auto *C = dyn_cast(Mask); - // We don't need a select if we know the mask bit is a 1. - if (!C || !C->getValue()[0]) { - // Cast the mask to an i1 vector and then extract the lowest element. - auto *MaskTy = FixedVectorType::get( - Builder.getInt1Ty(), - cast(Mask->getType())->getBitWidth()); - Mask = Builder.CreateBitCast(Mask, MaskTy); - Mask = Builder.CreateExtractElement(Mask, (uint64_t)0); - // Extract the lowest element from the passthru operand. 
- Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2), - (uint64_t)0); - V = Builder.CreateSelect(Mask, V, Passthru); - } - - // Insert the result back into the original argument 0. - V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0); - - return replaceInstUsesWith(*II, V); - } - } - break; - - // Constant fold ashr( , Ci ). - // Constant fold lshr( , Ci ). - // Constant fold shl( , Ci ). - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx512_psrai_q_128: - case Intrinsic::x86_avx512_psrai_q_256: - case Intrinsic::x86_avx512_psrai_d_512: - case Intrinsic::x86_avx512_psrai_q_512: - case Intrinsic::x86_avx512_psrai_w_512: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx512_psrli_d_512: - case Intrinsic::x86_avx512_psrli_q_512: - case Intrinsic::x86_avx512_psrli_w_512: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx512_pslli_d_512: - case Intrinsic::x86_avx512_pslli_q_512: - case Intrinsic::x86_avx512_pslli_w_512: - if (Value *V = simplifyX86immShift(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_avx2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx512_psra_q_128: - case Intrinsic::x86_avx512_psra_q_256: - case Intrinsic::x86_avx512_psra_d_512: - case Intrinsic::x86_avx512_psra_q_512: - case Intrinsic::x86_avx512_psra_w_512: - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx512_psrl_d_512: - case Intrinsic::x86_avx512_psrl_q_512: - case Intrinsic::x86_avx512_psrl_w_512: - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx512_psll_d_512: - case Intrinsic::x86_avx512_psll_q_512: - case Intrinsic::x86_avx512_psll_w_512: { - if (Value *V = simplifyX86immShift(*II, Builder)) - return replaceInstUsesWith(*II, V); - - // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector - // operand to compute the shift amount. 
- Value *Arg1 = II->getArgOperand(1); - assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && - "Unexpected packed shift size"); - unsigned VWidth = cast(Arg1->getType())->getNumElements(); - - if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) - return replaceOperand(*II, 1, V); - break; - } - - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_q_256: - case Intrinsic::x86_avx512_psllv_d_512: - case Intrinsic::x86_avx512_psllv_q_512: - case Intrinsic::x86_avx512_psllv_w_128: - case Intrinsic::x86_avx512_psllv_w_256: - case Intrinsic::x86_avx512_psllv_w_512: - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - case Intrinsic::x86_avx512_psrav_q_128: - case Intrinsic::x86_avx512_psrav_q_256: - case Intrinsic::x86_avx512_psrav_d_512: - case Intrinsic::x86_avx512_psrav_q_512: - case Intrinsic::x86_avx512_psrav_w_128: - case Intrinsic::x86_avx512_psrav_w_256: - case Intrinsic::x86_avx512_psrav_w_512: - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_q_256: - case Intrinsic::x86_avx512_psrlv_d_512: - case Intrinsic::x86_avx512_psrlv_q_512: - case Intrinsic::x86_avx512_psrlv_w_128: - case Intrinsic::x86_avx512_psrlv_w_256: - case Intrinsic::x86_avx512_psrlv_w_512: - if (Value *V = simplifyX86varShift(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_sse2_packssdw_128: - case Intrinsic::x86_sse2_packsswb_128: - case Intrinsic::x86_avx2_packssdw: - case Intrinsic::x86_avx2_packsswb: - case Intrinsic::x86_avx512_packssdw_512: - case Intrinsic::x86_avx512_packsswb_512: - if (Value *V = simplifyX86pack(*II, Builder, true)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_sse2_packuswb_128: - case Intrinsic::x86_sse41_packusdw: - case Intrinsic::x86_avx2_packusdw: - case Intrinsic::x86_avx2_packuswb: - case Intrinsic::x86_avx512_packusdw_512: - case Intrinsic::x86_avx512_packuswb_512: - if (Value *V = simplifyX86pack(*II, Builder, false)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_pclmulqdq: - case Intrinsic::x86_pclmulqdq_256: - case Intrinsic::x86_pclmulqdq_512: { - if (auto *C = dyn_cast(II->getArgOperand(2))) { - unsigned Imm = C->getZExtValue(); - - bool MadeChange = false; - Value *Arg0 = II->getArgOperand(0); - Value *Arg1 = II->getArgOperand(1); - unsigned VWidth = cast(Arg0->getType())->getNumElements(); - - APInt UndefElts1(VWidth, 0); - APInt DemandedElts1 = APInt::getSplat(VWidth, - APInt(2, (Imm & 0x01) ? 2 : 1)); - if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1, - UndefElts1)) { - replaceOperand(*II, 0, V); - MadeChange = true; - } - - APInt UndefElts2(VWidth, 0); - APInt DemandedElts2 = APInt::getSplat(VWidth, - APInt(2, (Imm & 0x10) ? 2 : 1)); - if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2, - UndefElts2)) { - replaceOperand(*II, 1, V); - MadeChange = true; - } - - // If either input elements are undef, the result is zero. 
- if (DemandedElts1.isSubsetOf(UndefElts1) || - DemandedElts2.isSubsetOf(UndefElts2)) - return replaceInstUsesWith(*II, - ConstantAggregateZero::get(II->getType())); - - if (MadeChange) - return II; - } - break; - } - - case Intrinsic::x86_sse41_insertps: - if (Value *V = simplifyX86insertps(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_sse4a_extrq: { - Value *Op0 = II->getArgOperand(0); - Value *Op1 = II->getArgOperand(1); - unsigned VWidth0 = cast(Op0->getType())->getNumElements(); - unsigned VWidth1 = cast(Op1->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && - VWidth1 == 16 && "Unexpected operand sizes"); - - // See if we're dealing with constant values. - Constant *C1 = dyn_cast(Op1); - ConstantInt *CILength = - C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)0)) - : nullptr; - ConstantInt *CIIndex = - C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)1)) - : nullptr; - - // Attempt to simplify to a constant, shuffle vector or EXTRQI call. - if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) - return replaceInstUsesWith(*II, V); - - // EXTRQ only uses the lowest 64-bits of the first 128-bit vector - // operands and the lowest 16-bits of the second. - bool MadeChange = false; - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { - replaceOperand(*II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { - replaceOperand(*II, 1, V); - MadeChange = true; - } - if (MadeChange) - return II; - break; - } - - case Intrinsic::x86_sse4a_extrqi: { - // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining - // bits of the lower 64-bits. The upper 64-bits are undefined. - Value *Op0 = II->getArgOperand(0); - unsigned VWidth = cast(Op0->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && - "Unexpected operand size"); - - // See if we're dealing with constant values. - ConstantInt *CILength = dyn_cast(II->getArgOperand(1)); - ConstantInt *CIIndex = dyn_cast(II->getArgOperand(2)); - - // Attempt to simplify to a constant or shuffle vector. - if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder)) - return replaceInstUsesWith(*II, V); - - // EXTRQI only uses the lowest 64-bits of the first 128-bit vector - // operand. - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) - return replaceOperand(*II, 0, V); - break; - } - - case Intrinsic::x86_sse4a_insertq: { - Value *Op0 = II->getArgOperand(0); - Value *Op1 = II->getArgOperand(1); - unsigned VWidth = cast(Op0->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && - cast(Op1->getType())->getNumElements() == 2 && - "Unexpected operand size"); - - // See if we're dealing with constant values. - Constant *C1 = dyn_cast(Op1); - ConstantInt *CI11 = - C1 ? dyn_cast_or_null(C1->getAggregateElement((unsigned)1)) - : nullptr; - - // Attempt to simplify to a constant, shuffle vector or INSERTQI call. - if (CI11) { - const APInt &V11 = CI11->getValue(); - APInt Len = V11.zextOrTrunc(6); - APInt Idx = V11.lshr(8).zextOrTrunc(6); - if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) - return replaceInstUsesWith(*II, V); - } - - // INSERTQ only uses the lowest 64-bits of the first 128-bit vector - // operand. 
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) - return replaceOperand(*II, 0, V); - break; - } - - case Intrinsic::x86_sse4a_insertqi: { - // INSERTQI: Extract lowest Length bits from lower half of second source and - // insert over first source starting at Index bit. The upper 64-bits are - // undefined. - Value *Op0 = II->getArgOperand(0); - Value *Op1 = II->getArgOperand(1); - unsigned VWidth0 = cast(Op0->getType())->getNumElements(); - unsigned VWidth1 = cast(Op1->getType())->getNumElements(); - assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && - Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && - VWidth1 == 2 && "Unexpected operand sizes"); - - // See if we're dealing with constant values. - ConstantInt *CILength = dyn_cast(II->getArgOperand(2)); - ConstantInt *CIIndex = dyn_cast(II->getArgOperand(3)); - - // Attempt to simplify to a constant or shuffle vector. - if (CILength && CIIndex) { - APInt Len = CILength->getValue().zextOrTrunc(6); - APInt Idx = CIIndex->getValue().zextOrTrunc(6); - if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder)) - return replaceInstUsesWith(*II, V); - } - - // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector - // operands. - bool MadeChange = false; - if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { - replaceOperand(*II, 0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { - replaceOperand(*II, 1, V); - MadeChange = true; - } - if (MadeChange) - return II; - break; - } - - case Intrinsic::x86_sse41_pblendvb: - case Intrinsic::x86_sse41_blendvps: - case Intrinsic::x86_sse41_blendvpd: - case Intrinsic::x86_avx_blendv_ps_256: - case Intrinsic::x86_avx_blendv_pd_256: - case Intrinsic::x86_avx2_pblendvb: { - // fold (blend A, A, Mask) -> A - Value *Op0 = II->getArgOperand(0); - Value *Op1 = II->getArgOperand(1); - Value *Mask = II->getArgOperand(2); - if (Op0 == Op1) - return replaceInstUsesWith(CI, Op0); - - // Zero Mask - select 1st argument. - if (isa(Mask)) - return replaceInstUsesWith(CI, Op0); - - // Constant Mask - select 1st/2nd argument lane based on top bit of mask. - if (auto *ConstantMask = dyn_cast(Mask)) { - Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask); - return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); - } - - // Convert to a vector select if we can bypass casts and find a boolean - // vector condition value. - Value *BoolVec; - Mask = peekThroughBitcast(Mask); - if (match(Mask, m_SExt(m_Value(BoolVec))) && - BoolVec->getType()->isVectorTy() && - BoolVec->getType()->getScalarSizeInBits() == 1) { - assert(Mask->getType()->getPrimitiveSizeInBits() == - II->getType()->getPrimitiveSizeInBits() && - "Not expecting mask and operands with different sizes"); - - unsigned NumMaskElts = - cast(Mask->getType())->getNumElements(); - unsigned NumOperandElts = - cast(II->getType())->getNumElements(); - if (NumMaskElts == NumOperandElts) - return SelectInst::Create(BoolVec, Op1, Op0); - - // If the mask has less elements than the operands, each mask bit maps to - // multiple elements of the operands. Bitcast back and forth. 
- if (NumMaskElts < NumOperandElts) { - Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType()); - Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType()); - Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0); - return new BitCastInst(Sel, II->getType()); - } - } - - break; - } - - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - case Intrinsic::x86_avx512_pshuf_b_512: - if (Value *V = simplifyX86pshufb(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_avx_vpermilvar_ps: - case Intrinsic::x86_avx_vpermilvar_ps_256: - case Intrinsic::x86_avx512_vpermilvar_ps_512: - case Intrinsic::x86_avx_vpermilvar_pd: - case Intrinsic::x86_avx_vpermilvar_pd_256: - case Intrinsic::x86_avx512_vpermilvar_pd_512: - if (Value *V = simplifyX86vpermilvar(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_avx2_permd: - case Intrinsic::x86_avx2_permps: - case Intrinsic::x86_avx512_permvar_df_256: - case Intrinsic::x86_avx512_permvar_df_512: - case Intrinsic::x86_avx512_permvar_di_256: - case Intrinsic::x86_avx512_permvar_di_512: - case Intrinsic::x86_avx512_permvar_hi_128: - case Intrinsic::x86_avx512_permvar_hi_256: - case Intrinsic::x86_avx512_permvar_hi_512: - case Intrinsic::x86_avx512_permvar_qi_128: - case Intrinsic::x86_avx512_permvar_qi_256: - case Intrinsic::x86_avx512_permvar_qi_512: - case Intrinsic::x86_avx512_permvar_sf_512: - case Intrinsic::x86_avx512_permvar_si_512: - if (Value *V = simplifyX86vpermv(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::x86_avx_maskload_ps: - case Intrinsic::x86_avx_maskload_pd: - case Intrinsic::x86_avx_maskload_ps_256: - case Intrinsic::x86_avx_maskload_pd_256: - case Intrinsic::x86_avx2_maskload_d: - case Intrinsic::x86_avx2_maskload_q: - case Intrinsic::x86_avx2_maskload_d_256: - case Intrinsic::x86_avx2_maskload_q_256: - if (Instruction *I = simplifyX86MaskedLoad(*II, *this)) - return I; - break; - - case Intrinsic::x86_sse2_maskmov_dqu: - case Intrinsic::x86_avx_maskstore_ps: - case Intrinsic::x86_avx_maskstore_pd: - case Intrinsic::x86_avx_maskstore_ps_256: - case Intrinsic::x86_avx_maskstore_pd_256: - case Intrinsic::x86_avx2_maskstore_d: - case Intrinsic::x86_avx2_maskstore_q: - case Intrinsic::x86_avx2_maskstore_d_256: - case Intrinsic::x86_avx2_maskstore_q_256: - if (simplifyX86MaskedStore(*II, *this)) - return nullptr; - break; - - case Intrinsic::x86_addcarry_32: - case Intrinsic::x86_addcarry_64: - if (Value *V = simplifyX86addcarry(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; - - case Intrinsic::ppc_altivec_vperm: - // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. - // Note that ppc_altivec_vperm has a big-endian bias, so when creating - // a vectorshuffle for little endian, we must undo the transformation - // performed on vec_perm in altivec.h. That is, we must complement - // the permutation mask with respect to 31 and reverse the order of - // V1 and V2. - if (Constant *Mask = dyn_cast(II->getArgOperand(2))) { - assert(cast(Mask->getType())->getNumElements() == 16 && - "Bad type for intrinsic!"); - - // Check that all of the elements are integer constants or undefs. - bool AllEltsOk = true; - for (unsigned i = 0; i != 16; ++i) { - Constant *Elt = Mask->getAggregateElement(i); - if (!Elt || !(isa(Elt) || isa(Elt))) { - AllEltsOk = false; - break; - } - } - - if (AllEltsOk) { - // Cast the input vectors to byte vectors. 
- Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0), - Mask->getType()); - Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1), - Mask->getType()); - Value *Result = UndefValue::get(Op0->getType()); - - // Only extract each element once. - Value *ExtractedElts[32]; - memset(ExtractedElts, 0, sizeof(ExtractedElts)); - - for (unsigned i = 0; i != 16; ++i) { - if (isa(Mask->getAggregateElement(i))) - continue; - unsigned Idx = - cast(Mask->getAggregateElement(i))->getZExtValue(); - Idx &= 31; // Match the hardware behavior. - if (DL.isLittleEndian()) - Idx = 31 - Idx; - - if (!ExtractedElts[Idx]) { - Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; - Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; - ExtractedElts[Idx] = - Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse, - Builder.getInt32(Idx&15)); - } - - // Insert this value into the result vector. - Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx], - Builder.getInt32(i)); - } - return CastInst::Create(Instruction::BitCast, Result, CI.getType()); - } - } - break; - - case Intrinsic::arm_neon_vld1: { - Align MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); - if (Value *V = simplifyNeonVld1(*II, MemAlign.value(), Builder)) - return replaceInstUsesWith(*II, V); - break; - } - - case Intrinsic::arm_neon_vld2: - case Intrinsic::arm_neon_vld3: - case Intrinsic::arm_neon_vld4: - case Intrinsic::arm_neon_vld2lane: - case Intrinsic::arm_neon_vld3lane: - case Intrinsic::arm_neon_vld4lane: - case Intrinsic::arm_neon_vst1: - case Intrinsic::arm_neon_vst2: - case Intrinsic::arm_neon_vst3: - case Intrinsic::arm_neon_vst4: - case Intrinsic::arm_neon_vst2lane: - case Intrinsic::arm_neon_vst3lane: - case Intrinsic::arm_neon_vst4lane: { - Align MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); - unsigned AlignArg = II->getNumArgOperands() - 1; - Value *AlignArgOp = II->getArgOperand(AlignArg); - MaybeAlign Align = cast(AlignArgOp)->getMaybeAlignValue(); - if (Align && *Align < MemAlign) - return replaceOperand(*II, AlignArg, - ConstantInt::get(Type::getInt32Ty(II->getContext()), - MemAlign.value(), false)); - break; - } case Intrinsic::arm_neon_vtbl1: case Intrinsic::aarch64_neon_tbl1: @@ -3453,690 +1326,6 @@ } break; } - case Intrinsic::arm_mve_pred_i2v: { - Value *Arg = II->getArgOperand(0); - Value *ArgArg; - if (match(Arg, m_Intrinsic(m_Value(ArgArg))) && - II->getType() == ArgArg->getType()) - return replaceInstUsesWith(*II, ArgArg); - Constant *XorMask; - if (match(Arg, - m_Xor(m_Intrinsic(m_Value(ArgArg)), - m_Constant(XorMask))) && - II->getType() == ArgArg->getType()) { - if (auto *CI = dyn_cast(XorMask)) { - if (CI->getValue().trunc(16).isAllOnesValue()) { - auto TrueVector = Builder.CreateVectorSplat( - cast(II->getType())->getNumElements(), - Builder.getTrue()); - return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); - } - } - } - KnownBits ScalarKnown(32); - if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16), - ScalarKnown, 0)) - return II; - break; - } - case Intrinsic::arm_mve_pred_v2i: { - Value *Arg = II->getArgOperand(0); - Value *ArgArg; - if (match(Arg, m_Intrinsic(m_Value(ArgArg)))) - return replaceInstUsesWith(*II, ArgArg); - if (!II->getMetadata(LLVMContext::MD_range)) { - Type *IntTy32 = Type::getInt32Ty(II->getContext()); - Metadata *M[] = { - ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), - ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF)) - }; - II->setMetadata(LLVMContext::MD_range, 
MDNode::get(II->getContext(), M)); - return II; - } - break; - } - case Intrinsic::arm_mve_vadc: - case Intrinsic::arm_mve_vadc_predicated: { - unsigned CarryOp = - (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; - assert(II->getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && - "Bad type for intrinsic!"); - - KnownBits CarryKnown(32); - if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29), - CarryKnown)) - return II; - break; - } - case Intrinsic::amdgcn_rcp: { - Value *Src = II->getArgOperand(0); - - // TODO: Move to ConstantFolding/InstSimplify? - if (isa(Src)) { - Type *Ty = II->getType(); - auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); - return replaceInstUsesWith(CI, QNaN); - } - - if (II->isStrictFP()) - break; - - if (const ConstantFP *C = dyn_cast(Src)) { - const APFloat &ArgVal = C->getValueAPF(); - APFloat Val(ArgVal.getSemantics(), 1); - Val.divide(ArgVal, APFloat::rmNearestTiesToEven); - - // This is more precise than the instruction may give. - // - // TODO: The instruction always flushes denormal results (except for f16), - // should this also? - return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); - } - - break; - } - case Intrinsic::amdgcn_rsq: { - Value *Src = II->getArgOperand(0); - - // TODO: Move to ConstantFolding/InstSimplify? - if (isa(Src)) { - Type *Ty = II->getType(); - auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics())); - return replaceInstUsesWith(CI, QNaN); - } - - break; - } - case Intrinsic::amdgcn_frexp_mant: - case Intrinsic::amdgcn_frexp_exp: { - Value *Src = II->getArgOperand(0); - if (const ConstantFP *C = dyn_cast(Src)) { - int Exp; - APFloat Significand = frexp(C->getValueAPF(), Exp, - APFloat::rmNearestTiesToEven); - - if (IID == Intrinsic::amdgcn_frexp_mant) { - return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), - Significand)); - } - - // Match instruction special case behavior. - if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf) - Exp = 0; - - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp)); - } - - if (isa(Src)) - return replaceInstUsesWith(CI, UndefValue::get(II->getType())); - - break; - } - case Intrinsic::amdgcn_class: { - enum { - S_NAN = 1 << 0, // Signaling NaN - Q_NAN = 1 << 1, // Quiet NaN - N_INFINITY = 1 << 2, // Negative infinity - N_NORMAL = 1 << 3, // Negative normal - N_SUBNORMAL = 1 << 4, // Negative subnormal - N_ZERO = 1 << 5, // Negative zero - P_ZERO = 1 << 6, // Positive zero - P_SUBNORMAL = 1 << 7, // Positive subnormal - P_NORMAL = 1 << 8, // Positive normal - P_INFINITY = 1 << 9 // Positive infinity - }; - - const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | - N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY; - - Value *Src0 = II->getArgOperand(0); - Value *Src1 = II->getArgOperand(1); - const ConstantInt *CMask = dyn_cast(Src1); - if (!CMask) { - if (isa(Src0)) - return replaceInstUsesWith(*II, UndefValue::get(II->getType())); - - if (isa(Src1)) - return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); - break; - } - - uint32_t Mask = CMask->getZExtValue(); - - // If all tests are made, it doesn't matter what the value is. - if ((Mask & FullMask) == FullMask) - return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true)); - - if ((Mask & FullMask) == 0) - return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); - - if (Mask == (S_NAN | Q_NAN)) { - // Equivalent of isnan. 
Replace with standard fcmp. - Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0); - FCmp->takeName(II); - return replaceInstUsesWith(*II, FCmp); - } - - if (Mask == (N_ZERO | P_ZERO)) { - // Equivalent of == 0. - Value *FCmp = Builder.CreateFCmpOEQ( - Src0, ConstantFP::get(Src0->getType(), 0.0)); - - FCmp->takeName(II); - return replaceInstUsesWith(*II, FCmp); - } - - // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other - if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) - return replaceOperand(*II, 1, ConstantInt::get(Src1->getType(), - Mask & ~(S_NAN | Q_NAN))); - - const ConstantFP *CVal = dyn_cast(Src0); - if (!CVal) { - if (isa(Src0)) - return replaceInstUsesWith(*II, UndefValue::get(II->getType())); - - // Clamp mask to used bits - if ((Mask & FullMask) != Mask) { - CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(), - { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) } - ); - - NewCall->takeName(II); - return replaceInstUsesWith(*II, NewCall); - } - - break; - } - - const APFloat &Val = CVal->getValueAPF(); - - bool Result = - ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || - ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || - ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || - ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || - ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || - ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || - ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || - ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || - ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || - ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); - - return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); - } - case Intrinsic::amdgcn_cvt_pkrtz: { - Value *Src0 = II->getArgOperand(0); - Value *Src1 = II->getArgOperand(1); - if (const ConstantFP *C0 = dyn_cast(Src0)) { - if (const ConstantFP *C1 = dyn_cast(Src1)) { - const fltSemantics &HalfSem - = II->getType()->getScalarType()->getFltSemantics(); - bool LosesInfo; - APFloat Val0 = C0->getValueAPF(); - APFloat Val1 = C1->getValueAPF(); - Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); - Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo); - - Constant *Folded = ConstantVector::get({ - ConstantFP::get(II->getContext(), Val0), - ConstantFP::get(II->getContext(), Val1) }); - return replaceInstUsesWith(*II, Folded); - } - } - - if (isa(Src0) && isa(Src1)) - return replaceInstUsesWith(*II, UndefValue::get(II->getType())); - - break; - } - case Intrinsic::amdgcn_cvt_pknorm_i16: - case Intrinsic::amdgcn_cvt_pknorm_u16: - case Intrinsic::amdgcn_cvt_pk_i16: - case Intrinsic::amdgcn_cvt_pk_u16: { - Value *Src0 = II->getArgOperand(0); - Value *Src1 = II->getArgOperand(1); - - if (isa(Src0) && isa(Src1)) - return replaceInstUsesWith(*II, UndefValue::get(II->getType())); - - break; - } - case Intrinsic::amdgcn_ubfe: - case Intrinsic::amdgcn_sbfe: { - // Decompose simple cases into standard shifts. - Value *Src = II->getArgOperand(0); - if (isa(Src)) - return replaceInstUsesWith(*II, Src); - - unsigned Width; - Type *Ty = II->getType(); - unsigned IntSize = Ty->getIntegerBitWidth(); - - ConstantInt *CWidth = dyn_cast(II->getArgOperand(2)); - if (CWidth) { - Width = CWidth->getZExtValue(); - if ((Width & (IntSize - 1)) == 0) - return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty)); - - // Hardware ignores high bits, so remove those. 
-      if (Width >= IntSize)
-        return replaceOperand(*II, 2, ConstantInt::get(CWidth->getType(),
-                                                       Width & (IntSize - 1)));
-    }
-
-    unsigned Offset;
-    ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
-    if (COffset) {
-      Offset = COffset->getZExtValue();
-      if (Offset >= IntSize)
-        return replaceOperand(*II, 1, ConstantInt::get(COffset->getType(),
-                                                       Offset & (IntSize - 1)));
-    }
-
-    bool Signed = IID == Intrinsic::amdgcn_sbfe;
-
-    if (!CWidth || !COffset)
-      break;
-
-    // The case of Width == 0 is handled above, which makes this transformation
-    // safe. If Width == 0, then the ashr and lshr instructions become poison
-    // values since the shift amount would be equal to the bit size.
-    assert(Width != 0);
-
-    // TODO: This allows folding to undef when the hardware has specific
-    // behavior?
-    if (Offset + Width < IntSize) {
-      Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width);
-      Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width)
-                                 : Builder.CreateLShr(Shl, IntSize - Width);
-      RightShift->takeName(II);
-      return replaceInstUsesWith(*II, RightShift);
-    }
-
-    Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset)
-                               : Builder.CreateLShr(Src, Offset);
-
-    RightShift->takeName(II);
-    return replaceInstUsesWith(*II, RightShift);
-  }
-  case Intrinsic::amdgcn_exp:
-  case Intrinsic::amdgcn_exp_compr: {
-    ConstantInt *En = cast<ConstantInt>(II->getArgOperand(1));
-    unsigned EnBits = En->getZExtValue();
-    if (EnBits == 0xf)
-      break; // All inputs enabled.
-
-    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
-    bool Changed = false;
-    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
-      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
-          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
-        Value *Src = II->getArgOperand(I + 2);
-        if (!isa<UndefValue>(Src)) {
-          replaceOperand(*II, I + 2, UndefValue::get(Src->getType()));
-          Changed = true;
-        }
-      }
-    }
-
-    if (Changed)
-      return II;
-
-    break;
-  }
-  case Intrinsic::amdgcn_fmed3: {
-    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
-    // for the shader.
-
-    Value *Src0 = II->getArgOperand(0);
-    Value *Src1 = II->getArgOperand(1);
-    Value *Src2 = II->getArgOperand(2);
-
-    // Checking for NaN before canonicalization provides better fidelity when
-    // mapping other operations onto fmed3 since the order of operands is
-    // unchanged.
-    CallInst *NewCall = nullptr;
-    if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) {
-      NewCall = Builder.CreateMinNum(Src1, Src2);
-    } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) {
-      NewCall = Builder.CreateMinNum(Src0, Src2);
-    } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
-      NewCall = Builder.CreateMaxNum(Src0, Src1);
-    }
-
-    if (NewCall) {
-      NewCall->copyFastMathFlags(II);
-      NewCall->takeName(II);
-      return replaceInstUsesWith(*II, NewCall);
-    }
-
-    bool Swap = false;
-    // Canonicalize constants to RHS operands.
- // - // fmed3(c0, x, c1) -> fmed3(x, c0, c1) - if (isa(Src0) && !isa(Src1)) { - std::swap(Src0, Src1); - Swap = true; - } - - if (isa(Src1) && !isa(Src2)) { - std::swap(Src1, Src2); - Swap = true; - } - - if (isa(Src0) && !isa(Src1)) { - std::swap(Src0, Src1); - Swap = true; - } - - if (Swap) { - II->setArgOperand(0, Src0); - II->setArgOperand(1, Src1); - II->setArgOperand(2, Src2); - return II; - } - - if (const ConstantFP *C0 = dyn_cast(Src0)) { - if (const ConstantFP *C1 = dyn_cast(Src1)) { - if (const ConstantFP *C2 = dyn_cast(Src2)) { - APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(), - C2->getValueAPF()); - return replaceInstUsesWith(*II, - ConstantFP::get(Builder.getContext(), Result)); - } - } - } - - break; - } - case Intrinsic::amdgcn_icmp: - case Intrinsic::amdgcn_fcmp: { - const ConstantInt *CC = cast(II->getArgOperand(2)); - // Guard against invalid arguments. - int64_t CCVal = CC->getZExtValue(); - bool IsInteger = IID == Intrinsic::amdgcn_icmp; - if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE || - CCVal > CmpInst::LAST_ICMP_PREDICATE)) || - (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE || - CCVal > CmpInst::LAST_FCMP_PREDICATE))) - break; - - Value *Src0 = II->getArgOperand(0); - Value *Src1 = II->getArgOperand(1); - - if (auto *CSrc0 = dyn_cast(Src0)) { - if (auto *CSrc1 = dyn_cast(Src1)) { - Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1); - if (CCmp->isNullValue()) { - return replaceInstUsesWith( - *II, ConstantExpr::getSExt(CCmp, II->getType())); - } - - // The result of V_ICMP/V_FCMP assembly instructions (which this - // intrinsic exposes) is one bit per thread, masked with the EXEC - // register (which contains the bitmask of live threads). So a - // comparison that always returns true is the same as a read of the - // EXEC register. - Function *NewF = Intrinsic::getDeclaration( - II->getModule(), Intrinsic::read_register, II->getType()); - Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")}; - MDNode *MD = MDNode::get(II->getContext(), MDArgs); - Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; - CallInst *NewCall = Builder.CreateCall(NewF, Args); - NewCall->addAttribute(AttributeList::FunctionIndex, - Attribute::Convergent); - NewCall->takeName(II); - return replaceInstUsesWith(*II, NewCall); - } - - // Canonicalize constants to RHS. - CmpInst::Predicate SwapPred - = CmpInst::getSwappedPredicate(static_cast(CCVal)); - II->setArgOperand(0, Src1); - II->setArgOperand(1, Src0); - II->setArgOperand(2, ConstantInt::get(CC->getType(), - static_cast(SwapPred))); - return II; - } - - if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE) - break; - - // Canonicalize compare eq with true value to compare != 0 - // llvm.amdgcn.icmp(zext (i1 x), 1, eq) - // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne) - // llvm.amdgcn.icmp(sext (i1 x), -1, eq) - // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne) - Value *ExtSrc; - if (CCVal == CmpInst::ICMP_EQ && - ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) || - (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) && - ExtSrc->getType()->isIntegerTy(1)) { - replaceOperand(*II, 1, ConstantInt::getNullValue(Src1->getType())); - replaceOperand(*II, 2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE)); - return II; - } - - CmpInst::Predicate SrcPred; - Value *SrcLHS; - Value *SrcRHS; - - // Fold compare eq/ne with 0 from a compare result as the predicate to the - // intrinsic. 
The typical use is a wave vote function in the library, which - // will be fed from a user code condition compared with 0. Fold in the - // redundant compare. - - // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne) - // -> llvm.amdgcn.[if]cmp(a, b, pred) - // - // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq) - // -> llvm.amdgcn.[if]cmp(a, b, inv pred) - if (match(Src1, m_Zero()) && - match(Src0, - m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) { - if (CCVal == CmpInst::ICMP_EQ) - SrcPred = CmpInst::getInversePredicate(SrcPred); - - Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ? - Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp; - - Type *Ty = SrcLHS->getType(); - if (auto *CmpType = dyn_cast(Ty)) { - // Promote to next legal integer type. - unsigned Width = CmpType->getBitWidth(); - unsigned NewWidth = Width; - - // Don't do anything for i1 comparisons. - if (Width == 1) - break; - - if (Width <= 16) - NewWidth = 16; - else if (Width <= 32) - NewWidth = 32; - else if (Width <= 64) - NewWidth = 64; - else if (Width > 64) - break; // Can't handle this. - - if (Width != NewWidth) { - IntegerType *CmpTy = Builder.getIntNTy(NewWidth); - if (CmpInst::isSigned(SrcPred)) { - SrcLHS = Builder.CreateSExt(SrcLHS, CmpTy); - SrcRHS = Builder.CreateSExt(SrcRHS, CmpTy); - } else { - SrcLHS = Builder.CreateZExt(SrcLHS, CmpTy); - SrcRHS = Builder.CreateZExt(SrcRHS, CmpTy); - } - } - } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) - break; - - Function *NewF = - Intrinsic::getDeclaration(II->getModule(), NewIID, - { II->getType(), - SrcLHS->getType() }); - Value *Args[] = { SrcLHS, SrcRHS, - ConstantInt::get(CC->getType(), SrcPred) }; - CallInst *NewCall = Builder.CreateCall(NewF, Args); - NewCall->takeName(II); - return replaceInstUsesWith(*II, NewCall); - } - - break; - } - case Intrinsic::amdgcn_ballot: { - if (auto *Src = dyn_cast(II->getArgOperand(0))) { - if (Src->isZero()) { - // amdgcn.ballot(i1 0) is zero. - return replaceInstUsesWith(*II, Constant::getNullValue(II->getType())); - } - - if (Src->isOne()) { - // amdgcn.ballot(i1 1) is exec. - const char *RegName = "exec"; - if (II->getType()->isIntegerTy(32)) - RegName = "exec_lo"; - else if (!II->getType()->isIntegerTy(64)) - break; - - Function *NewF = Intrinsic::getDeclaration( - II->getModule(), Intrinsic::read_register, II->getType()); - Metadata *MDArgs[] = {MDString::get(II->getContext(), RegName)}; - MDNode *MD = MDNode::get(II->getContext(), MDArgs); - Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)}; - CallInst *NewCall = Builder.CreateCall(NewF, Args); - NewCall->addAttribute(AttributeList::FunctionIndex, - Attribute::Convergent); - NewCall->takeName(II); - return replaceInstUsesWith(*II, NewCall); - } - } - break; - } - case Intrinsic::amdgcn_wqm_vote: { - // wqm_vote is identity when the argument is constant. 
-    if (!isa<Constant>(II->getArgOperand(0)))
-      break;
-
-    return replaceInstUsesWith(*II, II->getArgOperand(0));
-  }
-  case Intrinsic::amdgcn_kill: {
-    const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0));
-    if (!C || !C->getZExtValue())
-      break;
-
-    // amdgcn.kill(i1 1) is a no-op
-    return eraseInstFromFunction(CI);
-  }
-  case Intrinsic::amdgcn_update_dpp: {
-    Value *Old = II->getArgOperand(0);
-
-    auto BC = cast<ConstantInt>(II->getArgOperand(5));
-    auto RM = cast<ConstantInt>(II->getArgOperand(3));
-    auto BM = cast<ConstantInt>(II->getArgOperand(4));
-    if (BC->isZeroValue() ||
-        RM->getZExtValue() != 0xF ||
-        BM->getZExtValue() != 0xF ||
-        isa<UndefValue>(Old))
-      break;
-
-    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
-    return replaceOperand(*II, 0, UndefValue::get(Old->getType()));
-  }
-  case Intrinsic::amdgcn_permlane16:
-  case Intrinsic::amdgcn_permlanex16: {
-    // Discard vdst_in if it's not going to be read.
-    Value *VDstIn = II->getArgOperand(0);
-    if (isa<UndefValue>(VDstIn))
-      break;
-
-    ConstantInt *FetchInvalid = cast<ConstantInt>(II->getArgOperand(4));
-    ConstantInt *BoundCtrl = cast<ConstantInt>(II->getArgOperand(5));
-    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
-      break;
-
-    return replaceOperand(*II, 0, UndefValue::get(VDstIn->getType()));
-  }
-  case Intrinsic::amdgcn_readfirstlane:
-  case Intrinsic::amdgcn_readlane: {
-    // A constant value is trivially uniform.
-    if (Constant *C = dyn_cast<Constant>(II->getArgOperand(0)))
-      return replaceInstUsesWith(*II, C);
-
-    // The rest of these may not be safe if the exec may not be the same between
-    // the def and use.
-    Value *Src = II->getArgOperand(0);
-    Instruction *SrcInst = dyn_cast<Instruction>(Src);
-    if (SrcInst && SrcInst->getParent() != II->getParent())
-      break;
-
-    // readfirstlane (readfirstlane x) -> readfirstlane x
-    // readlane (readfirstlane x), y -> readfirstlane x
-    if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readfirstlane>()))
-      return replaceInstUsesWith(*II, Src);
-
-    if (IID == Intrinsic::amdgcn_readfirstlane) {
-      // readfirstlane (readlane x, y) -> readlane x, y
-      if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>()))
-        return replaceInstUsesWith(*II, Src);
-    } else {
-      // readlane (readlane x, y), y -> readlane x, y
-      if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>(
-                         m_Value(), m_Specific(II->getArgOperand(1)))))
-        return replaceInstUsesWith(*II, Src);
-    }
-
-    break;
-  }
-  case Intrinsic::amdgcn_ldexp: {
-    // FIXME: This doesn't introduce new instructions and belongs in
-    // InstructionSimplify.
-    Type *Ty = II->getType();
-    Value *Op0 = II->getArgOperand(0);
-    Value *Op1 = II->getArgOperand(1);
-
-    // Folding undef to qnan is safe regardless of the FP mode.
-    if (isa<UndefValue>(Op0)) {
-      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
-      return replaceInstUsesWith(*II, QNaN);
-    }
-
-    const APFloat *C = nullptr;
-    match(Op0, m_APFloat(C));
-
-    // FIXME: Should flush denorms depending on FP mode, but that's ignored
-    // everywhere else.
-    //
-    // These cases should be safe, even with strictfp.
-    // ldexp(0.0, x) -> 0.0
-    // ldexp(-0.0, x) -> -0.0
-    // ldexp(inf, x) -> inf
-    // ldexp(-inf, x) -> -inf
-    if (C && (C->isZero() || C->isInfinity()))
-      return replaceInstUsesWith(*II, Op0);
-
-    // With strictfp, be more careful about possibly needing to flush denormals
-    // or not, and snan behavior depends on ieee_mode.
-    if (II->isStrictFP())
-      break;
-
-    if (C && C->isNaN()) {
-      // FIXME: We just need to make the nan quiet here, but that's unavailable
-      // on APFloat, only IEEEfloat
-      auto *Quieted = ConstantFP::get(
-        Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
-      return replaceInstUsesWith(*II, Quieted);
-    }
-
-    // ldexp(x, 0) -> x
-    // ldexp(x, undef) -> x
-    if (isa<UndefValue>(Op1) || match(Op1, m_ZeroInt()))
-      return replaceInstUsesWith(*II, Op0);
-
-    break;
-  }
  case Intrinsic::hexagon_V6_vandvrt:
  case Intrinsic::hexagon_V6_vandvrt_128B: {
    // Simplify Q -> V -> Q conversion.
@@ -4360,12 +1549,19 @@
    }
    break;
  }
+  default: {
+    // Handle target specific intrinsics
+    Optional<Instruction *> V = targetInstCombineIntrinsic(*II);
+    if (V.hasValue())
+      return V.getValue();
+    break;
+  }
  }
  return visitCallBase(*II);
}
// Fence instruction simplification
-Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
+Instruction *InstCombinerImpl::visitFenceInst(FenceInst &FI) {
  // Remove identical consecutive fences.
  Instruction *Next = FI.getNextNonDebugInstruction();
  if (auto *NFI = dyn_cast<FenceInst>(Next))
@@ -4375,12 +1571,12 @@
}
// InvokeInst simplification
-Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
+Instruction *InstCombinerImpl::visitInvokeInst(InvokeInst &II) {
  return visitCallBase(II);
}
// CallBrInst simplification
-Instruction *InstCombiner::visitCallBrInst(CallBrInst &CBI) {
+Instruction *InstCombinerImpl::visitCallBrInst(CallBrInst &CBI) {
  return visitCallBase(CBI);
}
@@ -4420,7 +1616,7 @@
  return true;
}
-Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
+Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) {
  if (!CI->getCalledFunction()) return nullptr;
  auto InstCombineRAUW = [this](Instruction *From, Value *With) {
@@ -4577,7 +1773,7 @@
}
/// Improvements for call, callbr and invoke instructions.
-Instruction *InstCombiner::visitCallBase(CallBase &Call) {
+Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
  if (isAllocationFn(&Call, &TLI))
    annotateAnyAllocSite(Call, &TLI);
@@ -4728,7 +1924,7 @@
/// If the callee is a constexpr cast of a function, attempt to move the cast to
/// the arguments of the call/callbr/invoke.
-bool InstCombiner::transformConstExprCastCall(CallBase &Call) {
+bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
  auto *Callee =
      dyn_cast<Function>(Call.getCalledOperand()->stripPointerCasts());
  if (!Callee)
@@ -5012,8 +2208,8 @@
/// Turn a call to a function created by init_trampoline / adjust_trampoline
/// intrinsic pair into a direct call to the underlying function.
Instruction * -InstCombiner::transformCallThroughTrampoline(CallBase &Call, - IntrinsicInst &Tramp) { +InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call, + IntrinsicInst &Tramp) { Value *Callee = Call.getCalledOperand(); Type *CalleeTy = Callee->getType(); FunctionType *FTy = Call.getFunctionType(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -14,10 +14,11 @@ #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/DataLayout.h" #include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include using namespace llvm; using namespace PatternMatch; @@ -81,8 +82,8 @@ /// If we find a cast of an allocation instruction, try to eliminate the cast by /// moving the type information into the alloc. -Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, - AllocaInst &AI) { +Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI, + AllocaInst &AI) { PointerType *PTy = cast(CI.getType()); IRBuilderBase::InsertPointGuard Guard(Builder); @@ -160,8 +161,8 @@ /// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns /// true for, actually insert the code to evaluate the expression. -Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, - bool isSigned) { +Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty, + bool isSigned) { if (Constant *C = dyn_cast(V)) { C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/); // If we got a constantexpr back, try to simplify it with DL info. @@ -229,8 +230,9 @@ return InsertNewInstWith(Res, *I); } -Instruction::CastOps InstCombiner::isEliminableCastPair(const CastInst *CI1, - const CastInst *CI2) { +Instruction::CastOps +InstCombinerImpl::isEliminableCastPair(const CastInst *CI1, + const CastInst *CI2) { Type *SrcTy = CI1->getSrcTy(); Type *MidTy = CI1->getDestTy(); Type *DstTy = CI2->getDestTy(); @@ -257,7 +259,7 @@ } /// Implement the transforms common to all CastInst visitors. -Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { +Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) { Value *Src = CI.getOperand(0); // Try to eliminate a cast of a cast. @@ -342,7 +344,7 @@ /// /// This function works on both vectors and scalars. /// -static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, +static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombinerImpl &IC, Instruction *CxtI) { if (canAlwaysEvaluateInType(V, Ty)) return true; @@ -459,7 +461,8 @@ /// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32 /// ---> /// extractelement <4 x i32> %X, 1 -static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) { +static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, + InstCombinerImpl &IC) { Value *TruncOp = Trunc.getOperand(0); Type *DestType = Trunc.getType(); if (!TruncOp->hasOneUse() || !isa(DestType)) @@ -498,7 +501,7 @@ /// Rotate left/right may occur in a wider type than necessary because of type /// promotion rules. Try to narrow the inputs and convert to funnel shift. 
-Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) { +Instruction *InstCombinerImpl::narrowRotate(TruncInst &Trunc) { assert((isa(Trunc.getSrcTy()) || shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) && "Don't narrow to an illegal scalar type"); @@ -582,7 +585,7 @@ /// Try to narrow the width of math or bitwise logic instructions by pulling a /// truncate ahead of binary operators. /// TODO: Transforms for truncated shifts should be moved into here. -Instruction *InstCombiner::narrowBinOp(TruncInst &Trunc) { +Instruction *InstCombinerImpl::narrowBinOp(TruncInst &Trunc) { Type *SrcTy = Trunc.getSrcTy(); Type *DestTy = Trunc.getType(); if (!isa(SrcTy) && !shouldChangeType(SrcTy, DestTy)) @@ -687,7 +690,7 @@ return nullptr; } -Instruction *InstCombiner::visitTrunc(TruncInst &Trunc) { +Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { if (Instruction *Result = commonCastTransforms(Trunc)) return Result; @@ -894,8 +897,8 @@ return nullptr; } -Instruction *InstCombiner::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext, - bool DoTransform) { +Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext, + bool DoTransform) { // If we are just checking for a icmp eq of a single bit and zext'ing it // to an integer, then shift the bit to the appropriate place and then // cast to integer to avoid the comparison. @@ -1031,7 +1034,7 @@ /// /// This function works on both vectors and scalars. static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, - InstCombiner &IC, Instruction *CxtI) { + InstCombinerImpl &IC, Instruction *CxtI) { BitsToClear = 0; if (canAlwaysEvaluateInType(V, Ty)) return true; @@ -1136,7 +1139,7 @@ } } -Instruction *InstCombiner::visitZExt(ZExtInst &CI) { +Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) { // If this zero extend is only used by a truncate, let the truncate be // eliminated before we try to optimize this zext. if (CI.hasOneUse() && isa(CI.user_back())) @@ -1274,7 +1277,8 @@ } /// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp. -Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { +Instruction *InstCombinerImpl::transformSExtICmp(ICmpInst *ICI, + Instruction &CI) { Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1); ICmpInst::Predicate Pred = ICI->getPredicate(); @@ -1410,7 +1414,7 @@ return false; } -Instruction *InstCombiner::visitSExt(SExtInst &CI) { +Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) { // If this sign extend is only used by a truncate, let the truncate be // eliminated before we try to optimize this sext. if (CI.hasOneUse() && isa(CI.user_back())) @@ -1497,7 +1501,6 @@ return nullptr; } - /// Return a Constant* for the specified floating-point constant if it fits /// in the specified FP type without changing its value. static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { @@ -1616,7 +1619,7 @@ return false; } -Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { +Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { if (Instruction *I = commonCastTransforms(FPT)) return I; @@ -1800,7 +1803,7 @@ return nullptr; } -Instruction *InstCombiner::visitFPExt(CastInst &FPExt) { +Instruction *InstCombinerImpl::visitFPExt(CastInst &FPExt) { // If the source operand is a cast from integer to FP and known exact, then // cast the integer operand directly to the destination type. 
Type *Ty = FPExt.getType(); @@ -1818,7 +1821,7 @@ /// This is safe if the intermediate type has enough bits in its mantissa to /// accurately represent all values of X. For example, this won't work with /// i64 -> float -> i64. -Instruction *InstCombiner::foldItoFPtoI(CastInst &FI) { +Instruction *InstCombinerImpl::foldItoFPtoI(CastInst &FI) { if (!isa(FI.getOperand(0)) && !isa(FI.getOperand(0))) return nullptr; @@ -1858,29 +1861,29 @@ return replaceInstUsesWith(FI, X); } -Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) { +Instruction *InstCombinerImpl::visitFPToUI(FPToUIInst &FI) { if (Instruction *I = foldItoFPtoI(FI)) return I; return commonCastTransforms(FI); } -Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) { +Instruction *InstCombinerImpl::visitFPToSI(FPToSIInst &FI) { if (Instruction *I = foldItoFPtoI(FI)) return I; return commonCastTransforms(FI); } -Instruction *InstCombiner::visitUIToFP(CastInst &CI) { +Instruction *InstCombinerImpl::visitUIToFP(CastInst &CI) { return commonCastTransforms(CI); } -Instruction *InstCombiner::visitSIToFP(CastInst &CI) { +Instruction *InstCombinerImpl::visitSIToFP(CastInst &CI) { return commonCastTransforms(CI); } -Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { +Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) { // If the source integer type is not the intptr_t type for this target, do a // trunc or zext to the intptr_t type, then inttoptr of it. This allows the // cast to be exposed to other transforms. @@ -1903,7 +1906,7 @@ } /// Implement the transforms for cast of pointer (bitcast/ptrtoint) -Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { +Instruction *InstCombinerImpl::commonPointerCastTransforms(CastInst &CI) { Value *Src = CI.getOperand(0); if (GetElementPtrInst *GEP = dyn_cast(Src)) { @@ -1925,7 +1928,7 @@ return commonCastTransforms(CI); } -Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { +Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) { // If the destination integer type is not the intptr_t type for this target, // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast // to be exposed to other transforms. @@ -1963,9 +1966,9 @@ /// Try to replace it with a shuffle (and vector/vector bitcast) if possible. /// /// The source and destination vector types may have different element types. -static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal, - VectorType *DestTy, - InstCombiner &IC) { +static Instruction * +optimizeVectorResizeWithIntegerBitCasts(Value *InVal, VectorType *DestTy, + InstCombinerImpl &IC) { // We can only do this optimization if the output is a multiple of the input // element size, or the input is a multiple of the output element size. // Convert the input type to have the same element type as the output. @@ -2165,7 +2168,7 @@ /// /// Into two insertelements that do "buildvector{%inc, %inc5}". static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI, - InstCombiner &IC) { + InstCombinerImpl &IC) { VectorType *DestVecTy = cast(CI.getType()); Value *IntInput = CI.getOperand(0); @@ -2194,7 +2197,7 @@ /// vectors better than bitcasts of scalars because vector registers are /// usually not type-specific like scalar integer or scalar floating-point. static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast, - InstCombiner &IC) { + InstCombinerImpl &IC) { // TODO: Create and use a pattern matcher for ExtractElementInst. 
auto *ExtElt = dyn_cast(BitCast.getOperand(0)); if (!ExtElt || !ExtElt->hasOneUse()) @@ -2320,7 +2323,8 @@ /// /// All the related PHI nodes can be replaced by new PHI nodes with type A. /// The uses of \p CI can be changed to the new PHI node corresponding to \p PN. -Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) { +Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI, + PHINode *PN) { // BitCast used by Store can be handled in InstCombineLoadStoreAlloca.cpp. if (hasStoreUsersOnly(CI)) return nullptr; @@ -2484,7 +2488,7 @@ return RetVal; } -Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { +Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { // If the operands are integer typed then apply the integer transforms, // otherwise just apply the common ones. Value *Src = CI.getOperand(0); @@ -2666,7 +2670,7 @@ return commonCastTransforms(CI); } -Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) { +Instruction *InstCombinerImpl::visitAddrSpaceCast(AddrSpaceCastInst &CI) { // If the destination pointer element type is not the same as the source's // first do a bitcast to the destination type, and then the addrspacecast. // This allows the cast to be exposed to other transforms. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" using namespace llvm; using namespace PatternMatch; @@ -142,10 +143,10 @@ /// /// If AndCst is non-null, then the loaded value is masked with that constant /// before doing the comparison. This handles cases like "A[i]&4 == 0". -Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, - GlobalVariable *GV, - CmpInst &ICI, - ConstantInt *AndCst) { +Instruction * +InstCombinerImpl::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, + GlobalVariable *GV, CmpInst &ICI, + ConstantInt *AndCst) { Constant *Init = GV->getInitializer(); if (!isa(Init) && !isa(Init)) return nullptr; @@ -422,7 +423,7 @@ /// /// If we can't emit an optimized form for this expression, this returns null. /// -static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC, +static Value *evaluateGEPOffsetExpression(User *GEP, InstCombinerImpl &IC, const DataLayout &DL) { gep_type_iterator GTI = gep_type_begin(GEP); @@ -841,9 +842,9 @@ /// Fold comparisons between a GEP instruction and something else. At this point /// we know that the GEP is on the LHS of the comparison. -Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, - ICmpInst::Predicate Cond, - Instruction &I) { +Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, + ICmpInst::Predicate Cond, + Instruction &I) { // Don't transform signed compares of GEPs into index compares. Even if the // GEP is inbounds, the final add of the base pointer can have signed overflow // and would change the result of the icmp. 
@@ -1021,9 +1022,9 @@ return transformToIndexedCompare(GEPLHS, RHS, Cond, DL); } -Instruction *InstCombiner::foldAllocaCmp(ICmpInst &ICI, - const AllocaInst *Alloca, - const Value *Other) { +Instruction *InstCombinerImpl::foldAllocaCmp(ICmpInst &ICI, + const AllocaInst *Alloca, + const Value *Other) { assert(ICI.isEquality() && "Cannot fold non-equality comparison."); // It would be tempting to fold away comparisons between allocas and any @@ -1099,8 +1100,8 @@ } /// Fold "icmp pred (X+C), X". -Instruction *InstCombiner::foldICmpAddOpConst(Value *X, const APInt &C, - ICmpInst::Predicate Pred) { +Instruction *InstCombinerImpl::foldICmpAddOpConst(Value *X, const APInt &C, + ICmpInst::Predicate Pred) { // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0, // so the values can never be equal. Similarly for all other "or equals" // operators. @@ -1149,9 +1150,9 @@ /// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" -> /// (icmp eq/ne A, Log2(AP2/AP1)) -> /// (icmp eq/ne A, Log2(AP2) - Log2(AP1)). -Instruction *InstCombiner::foldICmpShrConstConst(ICmpInst &I, Value *A, - const APInt &AP1, - const APInt &AP2) { +Instruction *InstCombinerImpl::foldICmpShrConstConst(ICmpInst &I, Value *A, + const APInt &AP1, + const APInt &AP2) { assert(I.isEquality() && "Cannot fold icmp gt/lt"); auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) { @@ -1208,9 +1209,9 @@ /// Handle "(icmp eq/ne (shl AP2, A), AP1)" -> /// (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)). -Instruction *InstCombiner::foldICmpShlConstConst(ICmpInst &I, Value *A, - const APInt &AP1, - const APInt &AP2) { +Instruction *InstCombinerImpl::foldICmpShlConstConst(ICmpInst &I, Value *A, + const APInt &AP1, + const APInt &AP2) { assert(I.isEquality() && "Cannot fold icmp gt/lt"); auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) { @@ -1254,7 +1255,7 @@ /// static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, ConstantInt *CI2, ConstantInt *CI1, - InstCombiner &IC) { + InstCombinerImpl &IC) { // The transformation we're trying to do here is to transform this into an // llvm.sadd.with.overflow. To do this, we have to replace the original add // with a narrower add, and discard the add-with-constant that is part of the @@ -1340,7 +1341,7 @@ /// icmp eq/ne (urem/srem %x, %y), 0 /// iff %y is a power-of-two, we can replace this with a bit test: /// icmp eq/ne (and %x, (add %y, -1)), 0 -Instruction *InstCombiner::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) { +Instruction *InstCombinerImpl::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) { // This fold is only valid for equality predicates. if (!I.isEquality()) return nullptr; @@ -1359,7 +1360,7 @@ /// Fold equality-comparison between zero and any (maybe truncated) right-shift /// by one-less-than-bitwidth into a sign test on the original value. 
-Instruction *InstCombiner::foldSignBitTest(ICmpInst &I) { +Instruction *InstCombinerImpl::foldSignBitTest(ICmpInst &I) { Instruction *Val; ICmpInst::Predicate Pred; if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero()))) @@ -1390,7 +1391,7 @@ } // Handle icmp pred X, 0 -Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) { +Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) { CmpInst::Predicate Pred = Cmp.getPredicate(); if (!match(Cmp.getOperand(1), m_Zero())) return nullptr; @@ -1431,7 +1432,7 @@ /// should be moved to some other helper and extended as noted below (it is also /// possible that code has been made unnecessary - do we canonicalize IR to /// overflow/saturating intrinsics or not?). -Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) { +Instruction *InstCombinerImpl::foldICmpWithConstant(ICmpInst &Cmp) { // Match the following pattern, which is a common idiom when writing // overflow-safe integer arithmetic functions. The source performs an addition // in wider type and explicitly checks for overflow using comparisons against @@ -1477,7 +1478,7 @@ } /// Canonicalize icmp instructions based on dominating conditions. -Instruction *InstCombiner::foldICmpWithDominatingICmp(ICmpInst &Cmp) { +Instruction *InstCombinerImpl::foldICmpWithDominatingICmp(ICmpInst &Cmp) { // This is a cheap/incomplete check for dominance - just match a single // predecessor with a conditional branch. BasicBlock *CmpBB = Cmp.getParent(); @@ -1547,9 +1548,9 @@ } /// Fold icmp (trunc X, Y), C. -Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp, - TruncInst *Trunc, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp, + TruncInst *Trunc, + const APInt &C) { ICmpInst::Predicate Pred = Cmp.getPredicate(); Value *X = Trunc->getOperand(0); if (C.isOneValue() && C.getBitWidth() > 1) { @@ -1580,9 +1581,9 @@ } /// Fold icmp (xor X, Y), C. -Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp, - BinaryOperator *Xor, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpXorConstant(ICmpInst &Cmp, + BinaryOperator *Xor, + const APInt &C) { Value *X = Xor->getOperand(0); Value *Y = Xor->getOperand(1); const APInt *XorC; @@ -1649,8 +1650,10 @@ } /// Fold icmp (and (sh X, Y), C2), C1. -Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And, - const APInt &C1, const APInt &C2) { +Instruction *InstCombinerImpl::foldICmpAndShift(ICmpInst &Cmp, + BinaryOperator *And, + const APInt &C1, + const APInt &C2) { BinaryOperator *Shift = dyn_cast(And->getOperand(0)); if (!Shift || !Shift->isShift()) return nullptr; @@ -1733,9 +1736,9 @@ } /// Fold icmp (and X, C2), C1. -Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp, - BinaryOperator *And, - const APInt &C1) { +Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp, + BinaryOperator *And, + const APInt &C1) { bool isICMP_NE = Cmp.getPredicate() == ICmpInst::ICMP_NE; // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1 @@ -1841,9 +1844,9 @@ } /// Fold icmp (and X, Y), C. -Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp, - BinaryOperator *And, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp, + BinaryOperator *And, + const APInt &C) { if (Instruction *I = foldICmpAndConstConst(Cmp, And, C)) return I; @@ -1895,8 +1898,9 @@ } /// Fold icmp (or X, Y), C. 
-Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpOrConstant(ICmpInst &Cmp, + BinaryOperator *Or, + const APInt &C) { ICmpInst::Predicate Pred = Cmp.getPredicate(); if (C.isOneValue()) { // icmp slt signum(V) 1 --> icmp slt V, 1 @@ -1960,9 +1964,9 @@ } /// Fold icmp (mul X, Y), C. -Instruction *InstCombiner::foldICmpMulConstant(ICmpInst &Cmp, - BinaryOperator *Mul, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpMulConstant(ICmpInst &Cmp, + BinaryOperator *Mul, + const APInt &C) { const APInt *MulC; if (!match(Mul->getOperand(1), m_APInt(MulC))) return nullptr; @@ -2043,9 +2047,9 @@ } /// Fold icmp (shl X, Y), C. -Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp, - BinaryOperator *Shl, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp, + BinaryOperator *Shl, + const APInt &C) { const APInt *ShiftVal; if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal))) return foldICmpShlConstConst(Cmp, Shl->getOperand(1), C, *ShiftVal); @@ -2183,9 +2187,9 @@ } /// Fold icmp ({al}shr X, Y), C. -Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp, - BinaryOperator *Shr, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp, + BinaryOperator *Shr, + const APInt &C) { // An exact shr only shifts out zero bits, so: // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0 Value *X = Shr->getOperand(0); @@ -2276,9 +2280,9 @@ return nullptr; } -Instruction *InstCombiner::foldICmpSRemConstant(ICmpInst &Cmp, - BinaryOperator *SRem, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpSRemConstant(ICmpInst &Cmp, + BinaryOperator *SRem, + const APInt &C) { // Match an 'is positive' or 'is negative' comparison of remainder by a // constant power-of-2 value: // (X % pow2C) sgt/slt 0 @@ -2315,9 +2319,9 @@ } /// Fold icmp (udiv X, Y), C. -Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp, - BinaryOperator *UDiv, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpUDivConstant(ICmpInst &Cmp, + BinaryOperator *UDiv, + const APInt &C) { const APInt *C2; if (!match(UDiv->getOperand(0), m_APInt(C2))) return nullptr; @@ -2344,9 +2348,9 @@ } /// Fold icmp ({su}div X, Y), C. -Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp, - BinaryOperator *Div, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp, + BinaryOperator *Div, + const APInt &C) { // Fold: icmp pred ([us]div X, C2), C -> range test // Fold this div into the comparison, producing a range check. // Determine, based on the divide type, what the range is being @@ -2514,9 +2518,9 @@ } /// Fold icmp (sub X, Y), C. -Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp, - BinaryOperator *Sub, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpSubConstant(ICmpInst &Cmp, + BinaryOperator *Sub, + const APInt &C) { Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1); ICmpInst::Predicate Pred = Cmp.getPredicate(); const APInt *C2; @@ -2576,9 +2580,9 @@ } /// Fold icmp (add X, Y), C. 
-Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, - BinaryOperator *Add, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp, + BinaryOperator *Add, + const APInt &C) { Value *Y = Add->getOperand(1); const APInt *C2; if (Cmp.isEquality() || !match(Y, m_APInt(C2))) @@ -2642,10 +2646,10 @@ return nullptr; } -bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, - Value *&RHS, ConstantInt *&Less, - ConstantInt *&Equal, - ConstantInt *&Greater) { +bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, + Value *&RHS, ConstantInt *&Less, + ConstantInt *&Equal, + ConstantInt *&Greater) { // TODO: Generalize this to work with other comparison idioms or ensure // they get canonicalized into this form. @@ -2682,7 +2686,8 @@ if (PredB == ICmpInst::ICMP_SGT && isa(RHS2)) { // x sgt C-1 <--> x sge C <--> not(x slt C) auto FlippedStrictness = - getFlippedStrictnessPredicateAndConstant(PredB, cast(RHS2)); + InstCombiner::getFlippedStrictnessPredicateAndConstant( + PredB, cast(RHS2)); if (!FlippedStrictness) return false; assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check"); @@ -2694,9 +2699,9 @@ return PredB == ICmpInst::ICMP_SLT && RHS == RHS2; } -Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp, - SelectInst *Select, - ConstantInt *C) { +Instruction *InstCombinerImpl::foldICmpSelectConstant(ICmpInst &Cmp, + SelectInst *Select, + ConstantInt *C) { assert(C && "Cmp RHS should be a constant int!"); // If we're testing a constant value against the result of a three way @@ -2794,7 +2799,7 @@ const APInt *C; bool TrueIfSigned; if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() && - isSignBitCheck(Pred, *C, TrueIfSigned)) { + InstCombiner::isSignBitCheck(Pred, *C, TrueIfSigned)) { if (match(BCSrcOp, m_FPExt(m_Value(X))) || match(BCSrcOp, m_FPTrunc(m_Value(X)))) { // (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0 @@ -2870,7 +2875,7 @@ /// Try to fold integer comparisons with a constant operand: icmp Pred X, C /// where X is some kind of instruction. -Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) { +Instruction *InstCombinerImpl::foldICmpInstWithConstant(ICmpInst &Cmp) { const APInt *C; if (!match(Cmp.getOperand(1), m_APInt(C))) return nullptr; @@ -2955,9 +2960,8 @@ /// Fold an icmp equality instruction with binary operator LHS and constant RHS: /// icmp eq/ne BO, C. -Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp, - BinaryOperator *BO, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant( + ICmpInst &Cmp, BinaryOperator *BO, const APInt &C) { // TODO: Some of these folds could work with arbitrary constants, but this // function is limited to scalar and vector splat constants. if (!Cmp.isEquality()) @@ -3072,9 +3076,8 @@ } /// Fold an equality icmp with LLVM intrinsic and constant operand. -Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp, - IntrinsicInst *II, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant( + ICmpInst &Cmp, IntrinsicInst *II, const APInt &C) { Type *Ty = II->getType(); unsigned BitWidth = C.getBitWidth(); switch (II->getIntrinsicID()) { @@ -3145,9 +3148,9 @@ } /// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C. 
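Call sites such as isSignBitCheck and getFlippedStrictnessPredicateAndConstant are now spelled with an explicit InstCombiner:: qualifier because those helpers become static members of the public InstCombiner class; their old file-local definitions are deleted from InstCombineInternal.h later in this diff. A minimal usage sketch, assuming the signature stays the same as the removed inline helper (ICmpInst::Predicate, const APInt &, bool &); the wrapper function is hypothetical.

#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Sketch: the helper is reachable through the public InstCombiner class, so
// code outside this file (for example a target hook) can reuse it.
static bool isTrueWhenSignSet(ICmpInst &Cmp) {
  const APInt *C;
  if (!match(Cmp.getOperand(1), m_APInt(C)))
    return false;
  bool TrueIfSigned;
  return InstCombiner::isSignBitCheck(Cmp.getPredicate(), *C, TrueIfSigned) &&
         TrueIfSigned;
}
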
-Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp, - IntrinsicInst *II, - const APInt &C) { +Instruction *InstCombinerImpl::foldICmpIntrinsicWithConstant(ICmpInst &Cmp, + IntrinsicInst *II, + const APInt &C) { if (Cmp.isEquality()) return foldICmpEqIntrinsicWithConstant(Cmp, II, C); @@ -3204,7 +3207,7 @@ } /// Handle icmp with constant (but not simple integer constant) RHS. -Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) { +Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Constant *RHSC = dyn_cast(Op1); Instruction *LHSI = dyn_cast(Op0); @@ -3650,7 +3653,7 @@ /// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit /// Note that the comparison is commutative, while inverted (u>=, ==) predicate /// will mean that we are looking for the opposite answer. -Value *InstCombiner::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) { +Value *InstCombinerImpl::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) { ICmpInst::Predicate Pred; Value *X, *Y; Instruction *Mul; @@ -3716,7 +3719,8 @@ /// TODO: A large part of this logic is duplicated in InstSimplify's /// simplifyICmpWithBinOp(). We should be able to share that and avoid the code /// duplication. -Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I, const SimplifyQuery &SQ) { +Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, + const SimplifyQuery &SQ) { const SimplifyQuery Q = SQ.getWithInstruction(&I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -4170,7 +4174,7 @@ return nullptr; } -Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { +Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) { if (!I.isEquality()) return nullptr; @@ -4438,7 +4442,7 @@ } /// Handle icmp (cast x), (cast or constant). -Instruction *InstCombiner::foldICmpWithCastOp(ICmpInst &ICmp) { +Instruction *InstCombinerImpl::foldICmpWithCastOp(ICmpInst &ICmp) { auto *CastOp0 = dyn_cast(ICmp.getOperand(0)); if (!CastOp0) return nullptr; @@ -4493,9 +4497,10 @@ } } -OverflowResult InstCombiner::computeOverflow( - Instruction::BinaryOps BinaryOp, bool IsSigned, - Value *LHS, Value *RHS, Instruction *CxtI) const { +OverflowResult +InstCombinerImpl::computeOverflow(Instruction::BinaryOps BinaryOp, + bool IsSigned, Value *LHS, Value *RHS, + Instruction *CxtI) const { switch (BinaryOp) { default: llvm_unreachable("Unsupported binary op"); @@ -4517,9 +4522,11 @@ } } -bool InstCombiner::OptimizeOverflowCheck( - Instruction::BinaryOps BinaryOp, bool IsSigned, Value *LHS, Value *RHS, - Instruction &OrigI, Value *&Result, Constant *&Overflow) { +bool InstCombinerImpl::OptimizeOverflowCheck(Instruction::BinaryOps BinaryOp, + bool IsSigned, Value *LHS, + Value *RHS, Instruction &OrigI, + Value *&Result, + Constant *&Overflow) { if (OrigI.isCommutative() && isa(LHS) && !isa(RHS)) std::swap(LHS, RHS); @@ -4575,7 +4582,8 @@ /// \returns Instruction which must replace the compare instruction, NULL if no /// replacement required. static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, - Value *OtherVal, InstCombiner &IC) { + Value *OtherVal, + InstCombinerImpl &IC) { // Don't bother doing this transformation for pointers, don't do it for // vectors. 
if (!isa(MulVal->getType())) @@ -4723,7 +4731,7 @@ Function *F = Intrinsic::getDeclaration( I.getModule(), Intrinsic::umul_with_overflow, MulType); CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul"); - IC.Worklist.push(MulInstr); + IC.addToWorklist(MulInstr); // If there are uses of mul result other than the comparison, we know that // they are truncation or binary AND. Change them to use result of @@ -4750,11 +4758,11 @@ } else { llvm_unreachable("Unexpected Binary operation"); } - IC.Worklist.push(cast(U)); + IC.addToWorklist(cast(U)); } } if (isa(OtherVal)) - IC.Worklist.push(cast(OtherVal)); + IC.addToWorklist(cast(OtherVal)); // The original icmp gets replaced with the overflow value, maybe inverted // depending on predicate. @@ -4799,7 +4807,7 @@ // If this is a normal comparison, it demands all bits. If it is a sign bit // comparison, it only demands the sign bit. bool UnusedBit; - if (isSignBitCheck(I.getPredicate(), *RHS, UnusedBit)) + if (InstCombiner::isSignBitCheck(I.getPredicate(), *RHS, UnusedBit)) return APInt::getSignMask(BitWidth); switch (I.getPredicate()) { @@ -4856,9 +4864,9 @@ /// \return true when \p UI is the only use of \p DI in the parent block /// and all other uses of \p DI are in blocks dominated by \p DB. /// -bool InstCombiner::dominatesAllUses(const Instruction *DI, - const Instruction *UI, - const BasicBlock *DB) const { +bool InstCombinerImpl::dominatesAllUses(const Instruction *DI, + const Instruction *UI, + const BasicBlock *DB) const { assert(DI && UI && "Instruction not defined\n"); // Ignore incomplete definitions. if (!DI->getParent()) @@ -4931,9 +4939,9 @@ /// major restriction since a NE compare should be 'normalized' to an equal /// compare, which usually happens in the combiner and test case /// select-cmp-br.ll checks for it. -bool InstCombiner::replacedSelectWithOperand(SelectInst *SI, - const ICmpInst *Icmp, - const unsigned SIOpd) { +bool InstCombinerImpl::replacedSelectWithOperand(SelectInst *SI, + const ICmpInst *Icmp, + const unsigned SIOpd) { assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!"); if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) { BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1); @@ -4959,7 +4967,7 @@ /// Try to fold the comparison based on range information we can get by checking /// whether bits are known to be zero or one in the inputs. 
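processUMulZExtIdiom above now queues follow-up instructions through IC.addToWorklist(...) instead of pushing onto IC.Worklist directly, since the worklist member is no longer part of the surface that helpers are expected to poke at. A small sketch of the same pattern; the helper is hypothetical and not part of the patch.

#include "InstCombineInternal.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

// Sketch: after mutating users in place, hand them back to the combiner
// through the public addToWorklist() accessor.  Assumes all users are
// instructions, as in the rewrite above.
static void retargetUsers(Instruction &Old, Value *New, InstCombinerImpl &IC) {
  // Collect the users first; replaceUsesOfWith below edits Old's use list.
  SmallVector<Instruction *, 8> Users;
  for (User *U : Old.users())
    Users.push_back(cast<Instruction>(U));
  for (Instruction *UserInst : Users) {
    UserInst->replaceUsesOfWith(&Old, New);
    IC.addToWorklist(UserInst); // revisit the rewritten user
  }
}
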
-Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { +Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Type *Ty = Op0->getType(); ICmpInst::Predicate Pred = I.getPredicate(); @@ -5186,8 +5194,8 @@ } llvm::Optional> -llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, - Constant *C) { +InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, + Constant *C) { assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) && "Only for relational integer predicates."); @@ -5256,7 +5264,7 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { ICmpInst::Predicate Pred = I.getPredicate(); if (ICmpInst::isEquality(Pred) || !ICmpInst::isIntPredicate(Pred) || - isCanonicalPredicate(Pred)) + InstCombiner::isCanonicalPredicate(Pred)) return nullptr; Value *Op0 = I.getOperand(0); @@ -5265,7 +5273,8 @@ if (!Op1C) return nullptr; - auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C); + auto FlippedStrictness = + InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, Op1C); if (!FlippedStrictness) return nullptr; @@ -5277,11 +5286,11 @@ static CmpInst *canonicalizeICmpPredicate(CmpInst &I) { // Is the predicate already canonical? CmpInst::Predicate Pred = I.getPredicate(); - if (isCanonicalPredicate(Pred)) + if (InstCombiner::isCanonicalPredicate(Pred)) return nullptr; // Can all users be adjusted to predicate inversion? - if (!canFreelyInvertAllUsersOf(&I, /*IgnoredUser=*/nullptr)) + if (!InstCombiner::canFreelyInvertAllUsersOf(&I, /*IgnoredUser=*/nullptr)) return nullptr; // Ok, we can canonicalize comparison! @@ -5510,7 +5519,7 @@ return ExtractValueInst::Create(UAddOv, 1); } -Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { +Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { bool Changed = false; const SimplifyQuery Q = SQ.getWithInstruction(&I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -5748,8 +5757,9 @@ } /// Fold fcmp ([us]itofp x, cst) if possible. -Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI, - Constant *RHSC) { +Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I, + Instruction *LHSI, + Constant *RHSC) { if (!isa(RHSC)) return nullptr; const APFloat &RHS = cast(RHSC)->getValueAPF(); @@ -6034,7 +6044,7 @@ } /// Optimize fabs(X) compared with zero. 
-static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombiner &IC) { +static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombinerImpl &IC) { Value *X; if (!match(I.getOperand(0), m_Intrinsic(m_Value(X))) || !match(I.getOperand(1), m_PosZeroFP())) @@ -6096,7 +6106,7 @@ } } -Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { +Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { bool Changed = false; /// Orders the operands of the compare so that they are listed from most diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -15,40 +15,32 @@ #ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H #define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Argument.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/IR/Use.h" -#include "llvm/IR/Value.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" #include -#include #define DEBUG_TYPE "instcombine" using namespace llvm::PatternMatch; +// As a default, let's assume that we want to be aggressive, +// and attempt to traverse with no limits in attempt to sink negation. +static constexpr unsigned NegatorDefaultMaxDepth = ~0U; + +// Let's guesstimate that most often we will end up visiting/producing +// fairly small number of new instructions. +static constexpr unsigned NegatorMaxNodesSSO = 16; + namespace llvm { class AAResults; @@ -65,305 +57,26 @@ class TargetLibraryInfo; class User; -/// Assign a complexity or rank value to LLVM Values. This is used to reduce -/// the amount of pattern matching needed for compares and commutative -/// instructions. For example, if we have: -/// icmp ugt X, Constant -/// or -/// xor (add X, Constant), cast Z -/// -/// We do not have to consider the commuted variants of these patterns because -/// canonicalization based on complexity guarantees the above ordering. -/// -/// This routine maps IR values to various complexity ranks: -/// 0 -> undef -/// 1 -> Constants -/// 2 -> Other non-instructions -/// 3 -> Arguments -/// 4 -> Cast and (f)neg/not instructions -/// 5 -> Other instructions -static inline unsigned getComplexity(Value *V) { - if (isa(V)) { - if (isa(V) || match(V, m_Neg(m_Value())) || - match(V, m_Not(m_Value())) || match(V, m_FNeg(m_Value()))) - return 4; - return 5; - } - if (isa(V)) - return 3; - return isa(V) ? (isa(V) ? 0 : 1) : 2; -} - -/// Predicate canonicalization reduces the number of patterns that need to be -/// matched by other transforms. 
For example, we may swap the operands of a -/// conditional branch or select to create a compare with a canonical (inverted) -/// predicate which is then more likely to be matched with other values. -static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) { - switch (Pred) { - case CmpInst::ICMP_NE: - case CmpInst::ICMP_ULE: - case CmpInst::ICMP_SLE: - case CmpInst::ICMP_UGE: - case CmpInst::ICMP_SGE: - // TODO: There are 16 FCMP predicates. Should others be (not) canonical? - case CmpInst::FCMP_ONE: - case CmpInst::FCMP_OLE: - case CmpInst::FCMP_OGE: - return false; - default: - return true; - } -} - -/// Given an exploded icmp instruction, return true if the comparison only -/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the -/// result of the comparison is true when the input value is signed. -inline bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS, - bool &TrueIfSigned) { - switch (Pred) { - case ICmpInst::ICMP_SLT: // True if LHS s< 0 - TrueIfSigned = true; - return RHS.isNullValue(); - case ICmpInst::ICMP_SLE: // True if LHS s<= -1 - TrueIfSigned = true; - return RHS.isAllOnesValue(); - case ICmpInst::ICMP_SGT: // True if LHS s> -1 - TrueIfSigned = false; - return RHS.isAllOnesValue(); - case ICmpInst::ICMP_SGE: // True if LHS s>= 0 - TrueIfSigned = false; - return RHS.isNullValue(); - case ICmpInst::ICMP_UGT: - // True if LHS u> RHS and RHS == sign-bit-mask - 1 - TrueIfSigned = true; - return RHS.isMaxSignedValue(); - case ICmpInst::ICMP_UGE: - // True if LHS u>= RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) - TrueIfSigned = true; - return RHS.isMinSignedValue(); - case ICmpInst::ICMP_ULT: - // True if LHS u< RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) - TrueIfSigned = false; - return RHS.isMinSignedValue(); - case ICmpInst::ICMP_ULE: - // True if LHS u<= RHS and RHS == sign-bit-mask - 1 - TrueIfSigned = false; - return RHS.isMaxSignedValue(); - default: - return false; - } -} - -llvm::Optional> -getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, Constant *C); - -/// Return the source operand of a potentially bitcasted value while optionally -/// checking if it has one use. If there is no bitcast or the one use check is -/// not met, return the input value itself. -static inline Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) { - if (auto *BitCast = dyn_cast(V)) - if (!OneUseOnly || BitCast->hasOneUse()) - return BitCast->getOperand(0); - - // V is not a bitcast or V has more than one use and OneUseOnly is true. - return V; -} - -/// Add one to a Constant -static inline Constant *AddOne(Constant *C) { - return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); -} - -/// Subtract one from a Constant -static inline Constant *SubOne(Constant *C) { - return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1)); -} - -/// Return true if the specified value is free to invert (apply ~ to). -/// This happens in cases where the ~ can be eliminated. If WillInvertAllUses -/// is true, work under the assumption that the caller intends to remove all -/// uses of V and only keep uses of ~V. -/// -/// See also: canFreelyInvertAllUsersOf() -static inline bool isFreeToInvert(Value *V, bool WillInvertAllUses) { - // ~(~(X)) -> X. - if (match(V, m_Not(m_Value()))) - return true; - - // Constants can be considered to be not'ed values. - if (match(V, m_AnyIntegralConstant())) - return true; - - // Compares can be inverted if all of their uses are being modified to use the - // ~V. 
- if (isa(V)) - return WillInvertAllUses; - - // If `V` is of the form `A + Constant` then `-1 - V` can be folded into `(-1 - // - Constant) - A` if we are willing to invert all of the uses. - if (BinaryOperator *BO = dyn_cast(V)) - if (BO->getOpcode() == Instruction::Add || - BO->getOpcode() == Instruction::Sub) - if (isa(BO->getOperand(0)) || isa(BO->getOperand(1))) - return WillInvertAllUses; - - // Selects with invertible operands are freely invertible - if (match(V, m_Select(m_Value(), m_Not(m_Value()), m_Not(m_Value())))) - return WillInvertAllUses; - - return false; -} - -/// Given i1 V, can every user of V be freely adapted if V is changed to !V ? -/// InstCombine's canonicalizeICmpPredicate() must be kept in sync with this fn. -/// -/// See also: isFreeToInvert() -static inline bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) { - // Look at every user of V. - for (Use &U : V->uses()) { - if (U.getUser() == IgnoredUser) - continue; // Don't consider this user. - - auto *I = cast(U.getUser()); - switch (I->getOpcode()) { - case Instruction::Select: - if (U.getOperandNo() != 0) // Only if the value is used as select cond. - return false; - break; - case Instruction::Br: - assert(U.getOperandNo() == 0 && "Must be branching on that value."); - break; // Free to invert by swapping true/false values/destinations. - case Instruction::Xor: // Can invert 'xor' if it's a 'not', by ignoring it. - if (!match(I, m_Not(m_Value()))) - return false; // Not a 'not'. - break; - default: - return false; // Don't know, likely not freely invertible. - } - // So far all users were free to invert... - } - return true; // Can freely invert all users! -} - -/// Some binary operators require special handling to avoid poison and undefined -/// behavior. If a constant vector has undef elements, replace those undefs with -/// identity constants if possible because those are always safe to execute. -/// If no identity constant exists, replace undef with some other safe constant. -static inline Constant *getSafeVectorConstantForBinop( - BinaryOperator::BinaryOps Opcode, Constant *In, bool IsRHSConstant) { - auto *InVTy = dyn_cast(In->getType()); - assert(InVTy && "Not expecting scalars here"); - - Type *EltTy = InVTy->getElementType(); - auto *SafeC = ConstantExpr::getBinOpIdentity(Opcode, EltTy, IsRHSConstant); - if (!SafeC) { - // TODO: Should this be available as a constant utility function? It is - // similar to getBinOpAbsorber(). 
- if (IsRHSConstant) { - switch (Opcode) { - case Instruction::SRem: // X % 1 = 0 - case Instruction::URem: // X %u 1 = 0 - SafeC = ConstantInt::get(EltTy, 1); - break; - case Instruction::FRem: // X % 1.0 (doesn't simplify, but it is safe) - SafeC = ConstantFP::get(EltTy, 1.0); - break; - default: - llvm_unreachable("Only rem opcodes have no identity constant for RHS"); - } - } else { - switch (Opcode) { - case Instruction::Shl: // 0 << X = 0 - case Instruction::LShr: // 0 >>u X = 0 - case Instruction::AShr: // 0 >> X = 0 - case Instruction::SDiv: // 0 / X = 0 - case Instruction::UDiv: // 0 /u X = 0 - case Instruction::SRem: // 0 % X = 0 - case Instruction::URem: // 0 %u X = 0 - case Instruction::Sub: // 0 - X (doesn't simplify, but it is safe) - case Instruction::FSub: // 0.0 - X (doesn't simplify, but it is safe) - case Instruction::FDiv: // 0.0 / X (doesn't simplify, but it is safe) - case Instruction::FRem: // 0.0 % X = 0 - SafeC = Constant::getNullValue(EltTy); - break; - default: - llvm_unreachable("Expected to find identity constant for opcode"); - } - } - } - assert(SafeC && "Must have safe constant for binop"); - unsigned NumElts = InVTy->getNumElements(); - SmallVector Out(NumElts); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *C = In->getAggregateElement(i); - Out[i] = isa(C) ? SafeC : C; - } - return ConstantVector::get(Out); -} - -/// The core instruction combiner logic. -/// -/// This class provides both the logic to recursively visit instructions and -/// combine them. -class LLVM_LIBRARY_VISIBILITY InstCombiner - : public InstVisitor { - // FIXME: These members shouldn't be public. +class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final + : public InstCombiner, + public InstVisitor { public: - /// A worklist of the instructions that need to be simplified. - InstCombineWorklist &Worklist; - - /// An IRBuilder that automatically inserts new instructions into the - /// worklist. - using BuilderTy = IRBuilder; - BuilderTy &Builder; - -private: - // Mode in which we are running the combiner. - const bool MinimizeSize; + InstCombinerImpl(InstCombineWorklist &Worklist, BuilderTy &Builder, + bool MinimizeSize, AAResults *AA, AssumptionCache &AC, + TargetLibraryInfo &TLI, TargetTransformInfo &TTI, + DominatorTree &DT, OptimizationRemarkEmitter &ORE, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + const DataLayout &DL, LoopInfo *LI) + : InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE, + BFI, PSI, DL, LI) {} - AAResults *AA; - - // Required analyses. - AssumptionCache &AC; - TargetLibraryInfo &TLI; - DominatorTree &DT; - const DataLayout &DL; - const SimplifyQuery SQ; - OptimizationRemarkEmitter &ORE; - BlockFrequencyInfo *BFI; - ProfileSummaryInfo *PSI; - - // Optional analyses. When non-null, these can both be used to do better - // combining and will be updated to reflect any changes. - LoopInfo *LI; - - bool MadeIRChange = false; - -public: - InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder, - bool MinimizeSize, AAResults *AA, - AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, - OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI) - : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), - AA(AA), AC(AC), TLI(TLI), DT(DT), - DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {} + virtual ~InstCombinerImpl() {} /// Run the combiner over the entire worklist until it is empty. /// /// \returns true if the IR is changed. 
bool run(); - AssumptionCache &getAssumptionCache() const { return AC; } - - const DataLayout &getDataLayout() const { return DL; } - - DominatorTree &getDominatorTree() const { return DT; } - - LoopInfo *getLoopInfo() const { return LI; } - - TargetLibraryInfo &getTargetLibraryInfo() const { return TLI; } - // Visitation implementation - Implement instruction combining for different // instruction types. The semantics are as follows: // Return Value: @@ -726,7 +439,7 @@ /// When dealing with an instruction that has side effects or produces a void /// value, we can't rely on DCE to delete the instruction. Instead, visit /// methods should return the value returned by this function. - Instruction *eraseInstFromFunction(Instruction &I) { + Instruction *eraseInstFromFunction(Instruction &I) override { LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n'); assert(I.use_empty() && "Cannot erase instruction that is used!"); salvageDebugInfo(I); @@ -808,10 +521,6 @@ Instruction::BinaryOps BinaryOp, bool IsSigned, Value *LHS, Value *RHS, Instruction *CxtI) const; - /// Maximum size of array considered when transforming. - uint64_t MaxArraySizeForCombine = 0; - -private: /// Performs a few simplifications for operators which are associative /// or commutative. bool SimplifyAssociativeOrCommutative(BinaryOperator &I); @@ -857,7 +566,7 @@ unsigned Depth, Instruction *CxtI); bool SimplifyDemandedBits(Instruction *I, unsigned Op, const APInt &DemandedMask, KnownBits &Known, - unsigned Depth = 0); + unsigned Depth = 0) override; /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne /// bits. It also tries to handle simplifications that can be done based on @@ -877,13 +586,10 @@ /// demanded bits. bool SimplifyDemandedInstructionBits(Instruction &Inst); - Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, - APInt DemandedElts, - int DmaskIdx = -1); - - Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, - APInt &UndefElts, unsigned Depth = 0, - bool AllowMultipleUsers = false); + virtual Value * + SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, + unsigned Depth = 0, + bool AllowMultipleUsers = false) override; /// Canonicalize the position of binops relative to shufflevector. Instruction *foldVectorBinop(BinaryOperator &Inst); @@ -1023,18 +729,6 @@ Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap); }; -namespace { - -// As a default, let's assume that we want to be aggressive, -// and attempt to traverse with no limits in attempt to sink negation. -static constexpr unsigned NegatorDefaultMaxDepth = ~0U; - -// Let's guesstimate that most often we will end up visiting/producing -// fairly small number of new instructions. -static constexpr unsigned NegatorMaxNodesSSO = 16; - -} // namespace - class Negator final { /// Top-to-bottom, def-to-use negated instruction tree we produced. SmallVector NewInstructions; @@ -1078,7 +772,7 @@ /// Attempt to negate \p Root. Retuns nullptr if negation can't be performed, /// otherwise returns negated value. 
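The net effect of the header changes above is that InstCombinerImpl keeps the pass internals, while the public InstCombiner base it now derives from is left carrying what outside code may see: the builder, the worklist accessor, the analysis getters that disappear from this header, and virtual entry points such as eraseInstFromFunction and SimplifyDemandedVectorElts. That is the object a target receives through the instCombineIntrinsic TTI hook. The following is a minimal sketch of such a hook for a made-up target; MyTargetTTIImpl and the mytarget_passthru intrinsic are invented for illustration, and only the hook signature and the InstCombiner members used come from this patch.

#include "llvm/ADT/Optional.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

// Hypothetical target hook enabled by this patch.
Optional<Instruction *>
MyTargetTTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                      IntrinsicInst &II) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::mytarget_passthru:
    // A pass-through intrinsic folds to its only argument; the combiner
    // handles the RAUW and worklist maintenance in replaceInstUsesWith.
    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  default:
    break;
  }
  // None means "nothing target-specific to do"; generic combines still run.
  return None;
}
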
LLVM_NODISCARD static Value *Negate(bool LHSIsZero, Value *Root, - InstCombiner &IC); + InstCombinerImpl &IC); }; } // end namespace llvm diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -166,7 +167,8 @@ APInt(64, AllocaSize), DL); } -static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { +static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC, + AllocaInst &AI) { // Check for array size of 1 (scalar allocation). if (!AI.isArrayAllocation()) { // i32 1 is the canonical array size for scalar allocations. @@ -234,7 +236,7 @@ // instruction. class PointerReplacer { public: - PointerReplacer(InstCombiner &IC) : IC(IC) {} + PointerReplacer(InstCombinerImpl &IC) : IC(IC) {} void replacePointer(Instruction &I, Value *V); private: @@ -244,7 +246,7 @@ SmallVector Path; MapVector WorkMap; - InstCombiner &IC; + InstCombinerImpl &IC; }; } // end anonymous namespace @@ -323,7 +325,7 @@ findLoadAndReplace(I); } -Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { +Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) { if (auto *I = simplifyAllocaArraySize(*this, AI)) return I; @@ -421,9 +423,9 @@ /// that pointer type, load it, etc. /// /// Note that this will create all of the instructions with whatever insert -/// point the \c InstCombiner currently is using. -LoadInst *InstCombiner::combineLoadToNewType(LoadInst &LI, Type *NewTy, - const Twine &Suffix) { +/// point the \c InstCombinerImpl currently is using. +LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy, + const Twine &Suffix) { assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) && "can't fold an atomic load to requested type"); @@ -445,7 +447,8 @@ /// Combine a store to a new type. /// /// Returns the newly created store instruction. -static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value *V) { +static StoreInst *combineStoreToNewValue(InstCombinerImpl &IC, StoreInst &SI, + Value *V) { assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) && "can't fold an atomic store of requested type"); @@ -502,7 +505,7 @@ static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) { assert(V->getType()->isPointerTy() && "Expected pointer type."); // Ignore possible ty* to ixx* bitcast. - V = peekThroughBitcast(V); + V = InstCombiner::peekThroughBitcast(V); // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax // pattern. CmpInst::Predicate Pred; @@ -537,7 +540,8 @@ /// or a volatile load. This is debatable, and might be reasonable to change /// later. However, it is risky in case some backend or other part of LLVM is /// relying on the exact type loaded to select appropriate atomic operations. -static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) { +static Instruction *combineLoadToOperationType(InstCombinerImpl &IC, + LoadInst &LI) { // FIXME: We could probably with some care handle both volatile and ordered // atomic loads here but it isn't clear that this is important. 
if (!LI.isUnordered()) @@ -563,9 +567,9 @@ if (!Ty->isIntegerTy() && Ty->isSized() && !isa(Ty) && DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) && DL.typeSizeEqualsStoreSize(Ty) && !DL.isNonIntegralPointerType(Ty) && - !isMinMaxWithLoads( - peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true), - Dummy)) { + !isMinMaxWithLoads(InstCombiner::peekThroughBitcast( + LI.getPointerOperand(), /*OneUseOnly=*/true), + Dummy)) { if (all_of(LI.users(), [&LI](User *U) { auto *SI = dyn_cast(U); return SI && SI->getPointerOperand() != &LI && @@ -605,7 +609,7 @@ return nullptr; } -static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { +static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) { // FIXME: We could probably with some care handle both volatile and atomic // stores here but it isn't clear that this is important. if (!LI.isSimple()) @@ -804,8 +808,9 @@ // not zero. Currently, we only handle the first such index. Also, we could // also search through non-zero constant indices if we kept track of the // offsets those indices implied. -static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI, - Instruction *MemI, unsigned &Idx) { +static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC, + GetElementPtrInst *GEPI, Instruction *MemI, + unsigned &Idx) { if (GEPI->getNumOperands() < 2) return false; @@ -874,7 +879,7 @@ // access, but the object has only one element, we can assume that the index // will always be zero. If we replace the GEP, return it. template -static Instruction *replaceGEPIdxWithZero(InstCombiner &IC, Value *Ptr, +static Instruction *replaceGEPIdxWithZero(InstCombinerImpl &IC, Value *Ptr, T &MemI) { if (GetElementPtrInst *GEPI = dyn_cast(Ptr)) { unsigned Idx; @@ -916,7 +921,7 @@ return false; } -Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { +Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) { Value *Op = LI.getOperand(0); // Try to canonicalize the loaded type. @@ -1033,7 +1038,7 @@ /// and the layout of a <2 x double> is isomorphic to a [2 x double], /// then %V1 can be safely approximated by a conceptual "bitcast" of %U. /// Note that %U may contain non-undef values where %V1 has undef. -static Value *likeBitCastFromVector(InstCombiner &IC, Value *V) { +static Value *likeBitCastFromVector(InstCombinerImpl &IC, Value *V) { Value *U = nullptr; while (auto *IV = dyn_cast(V)) { auto *E = dyn_cast(IV->getInsertedValueOperand()); @@ -1094,7 +1099,7 @@ /// the caller must erase the store instruction. We have to let the caller erase /// the store instruction as otherwise there is no way to signal whether it was /// combined or not: IC.EraseInstFromFunction returns a null pointer. -static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) { +static bool combineStoreToValueType(InstCombinerImpl &IC, StoreInst &SI) { // FIXME: We could probably with some care handle both volatile and ordered // atomic stores here but it isn't clear that this is important. if (!SI.isUnordered()) @@ -1126,7 +1131,7 @@ return false; } -static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) { +static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) { // FIXME: We could probably with some care handle both volatile and atomic // stores here but it isn't clear that this is important. 
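As in the compares file, the load/store helpers now spell InstCombiner::peekThroughBitcast(...) because that utility also moves onto the public class. A short usage sketch, assuming it keeps the removed helper's behaviour (return the bitcast source, optionally only for single-use bitcasts, otherwise the value itself); the wrapper function is hypothetical.

#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

// Sketch: strip a single-use bitcast from a load's pointer operand before
// inspecting the underlying object.
static bool loadsFromAlloca(LoadInst &LI) {
  Value *Ptr = InstCombiner::peekThroughBitcast(LI.getPointerOperand(),
                                                /*OneUseOnly=*/true);
  return isa<AllocaInst>(Ptr);
}
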
if (!SI.isSimple()) @@ -1266,7 +1271,7 @@ /// Converts store (bitcast (load (bitcast (select ...)))) to /// store (load (select ...)), where select is minmax: /// select ((cmp load V1, load V2), V1, V2). -static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC, +static bool removeBitcastsFromLoadStoreOnMinMax(InstCombinerImpl &IC, StoreInst &SI) { // bitcast? if (!match(SI.getPointerOperand(), m_BitCast(m_Value()))) @@ -1296,7 +1301,8 @@ if (!all_of(LI->users(), [LI, LoadAddr](User *U) { auto *SI = dyn_cast(U); return SI && SI->getPointerOperand() != LI && - peekThroughBitcast(SI->getPointerOperand()) != LoadAddr && + InstCombiner::peekThroughBitcast(SI->getPointerOperand()) != + LoadAddr && !SI->getPointerOperand()->isSwiftError(); })) return false; @@ -1314,7 +1320,7 @@ return true; } -Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { +Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) { Value *Val = SI.getOperand(0); Value *Ptr = SI.getOperand(1); @@ -1433,7 +1439,7 @@ /// or: /// *P = v1; if () { *P = v2; } /// into a phi node with a store in the successor. -bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) { +bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) { if (!SI.isUnordered()) return false; // This code has not been audited for volatile/ordered case. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -32,6 +32,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include #include @@ -46,7 +47,7 @@ /// The specific integer value is used in a context where it is known to be /// non-zero. If this allows us to simplify the computation, do so and return /// the new operand, otherwise return null. -static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, +static Value *simplifyValueKnownNonZero(Value *V, InstCombinerImpl &IC, Instruction &CxtI) { // If V has multiple uses, then we would have to do more analysis to determine // if this is safe. For example, the use could be in dynamically unreached @@ -171,7 +172,7 @@ return nullptr; } -Instruction *InstCombiner::visitMul(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -423,7 +424,7 @@ return Changed ? &I : nullptr; } -Instruction *InstCombiner::foldFPSignBitOps(BinaryOperator &I) { +Instruction *InstCombinerImpl::foldFPSignBitOps(BinaryOperator &I) { BinaryOperator::BinaryOps Opcode = I.getOpcode(); assert((Opcode == Instruction::FMul || Opcode == Instruction::FDiv) && "Expected fmul or fdiv"); @@ -457,7 +458,7 @@ return nullptr; } -Instruction *InstCombiner::visitFMul(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) { if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) @@ -637,7 +638,7 @@ /// Fold a divide or remainder with a select instruction divisor when one of the /// select operands is zero. In that case, we can use the other select operand /// because div/rem by zero is undefined. 
-bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) { +bool InstCombinerImpl::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) { SelectInst *SI = dyn_cast(I.getOperand(1)); if (!SI) return false; @@ -738,7 +739,7 @@ /// instructions (udiv and sdiv). It is called by the visitors to those integer /// division instructions. /// Common integer divide transforms -Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { +Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); bool IsSigned = I.getOpcode() == Instruction::SDiv; Type *Ty = I.getType(); @@ -874,7 +875,7 @@ using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1, const BinaryOperator &I, - InstCombiner &IC); + InstCombinerImpl &IC); /// Used to maintain state for visitUDivOperand(). struct UDivFoldAction { @@ -903,7 +904,8 @@ // X udiv 2^C -> X >> C static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1, - const BinaryOperator &I, InstCombiner &IC) { + const BinaryOperator &I, + InstCombinerImpl &IC) { Constant *C1 = getLogBase2(Op0->getType(), cast(Op1)); if (!C1) llvm_unreachable("Failed to constant fold udiv -> logbase2"); @@ -916,7 +918,7 @@ // X udiv (C1 << N), where C1 is "1< X >> (N+C2) // X udiv (zext (C1 << N)), where C1 is "1< X >> (N+C2) static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I, - InstCombiner &IC) { + InstCombinerImpl &IC) { Value *ShiftLeft; if (!match(Op1, m_ZExt(m_Value(ShiftLeft)))) ShiftLeft = Op1; @@ -1010,7 +1012,7 @@ return nullptr; } -Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) { if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1104,7 +1106,7 @@ return nullptr; } -Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) { if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1265,7 +1267,7 @@ return BinaryOperator::CreateFDivFMF(NewC, X, &I); } -Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) @@ -1372,7 +1374,7 @@ /// instructions (urem and srem). It is called by the visitors to those integer /// remainder instructions. /// Common integer remainder transforms -Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) { +Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); // The RHS is known non-zero. 
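foldUDivPow2Cst above implements the documented "X udiv 2^C -> X >> C" rewrite. Purely as an illustration (the in-tree fold goes through getLogBase2 so it can also handle vector constants), the scalar case looks like the sketch below; the helper name is hypothetical.

#include <cassert>

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Illustrative scalar-only version of "X udiv 2^C --> X >> C".
static Value *buildUDivByPow2(IRBuilder<> &Builder, Value *X,
                              const ConstantInt *DivisorC) {
  const APInt &D = DivisorC->getValue();
  assert(D.isPowerOf2() && "Caller must prove the divisor is 2^C");
  // udiv by 2^C is a logical shift right by C.
  return Builder.CreateLShr(X, D.logBase2());
}
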
@@ -1410,7 +1412,7 @@ return nullptr; } -Instruction *InstCombiner::visitURem(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitURem(BinaryOperator &I) { if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1461,7 +1463,7 @@ return nullptr; } -Instruction *InstCombiner::visitSRem(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitSRem(BinaryOperator &I) { if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1484,7 +1486,7 @@ // -X srem Y --> -(X srem Y) Value *X, *Y; if (match(&I, m_SRem(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y)))) - return BinaryOperator::CreateNSWNeg(Builder.CreateSRem(X, Y)); + return BinaryOperator::CreateNSWNeg(Builder.CreateSRem(X, Y)); // If the sign bits of both operands are zero (i.e. we can prove they are // unsigned inputs), turn this into a urem. @@ -1533,7 +1535,7 @@ return nullptr; } -Instruction *InstCombiner::visitFRem(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitFRem(BinaryOperator &I) { if (Value *V = SimplifyFRemInst(I.getOperand(0), I.getOperand(1), I.getFastMathFlags(), SQ.getWithInstruction(&I))) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp @@ -42,6 +42,7 @@ #include "llvm/Support/DebugCounter.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include #include #include @@ -430,7 +431,7 @@ } LLVM_NODISCARD Value *Negator::Negate(bool LHSIsZero, Value *Root, - InstCombiner &IC) { + InstCombinerImpl &IC) { ++NegatorTotalNegationsAttempted; LLVM_DEBUG(dbgs() << "Negator: attempting to sink negation into " << *Root << "\n"); diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -30,7 +31,7 @@ /// The PHI arguments will be folded into a single operation with a PHI node /// as input. The debug location of the single operation will be the merged /// locations of the original PHI node arguments. -void InstCombiner::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) { +void InstCombinerImpl::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) { auto *FirstInst = cast(PN.getIncomingValue(0)); Inst->setDebugLoc(FirstInst->getDebugLoc()); // We do not expect a CallInst here, otherwise, N-way merging of DebugLoc @@ -93,7 +94,7 @@ // ptr_val_inc = ... // ... // -Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) { +Instruction *InstCombinerImpl::FoldIntegerTypedPHI(PHINode &PN) { if (!PN.getType()->isIntegerTy()) return nullptr; if (!PN.hasOneUse()) @@ -292,7 +293,7 @@ /// If we have something like phi [add (a,b), add(a,c)] and if a/b/c and the /// adds all have a single use, turn this into a phi and a single binop. 
-Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { +Instruction *InstCombinerImpl::FoldPHIArgBinOpIntoPHI(PHINode &PN) { Instruction *FirstInst = cast(PN.getIncomingValue(0)); assert(isa(FirstInst) || isa(FirstInst)); unsigned Opc = FirstInst->getOpcode(); @@ -385,7 +386,7 @@ return NewBinOp; } -Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { +Instruction *InstCombinerImpl::FoldPHIArgGEPIntoPHI(PHINode &PN) { GetElementPtrInst *FirstInst =cast(PN.getIncomingValue(0)); SmallVector FixedOperands(FirstInst->op_begin(), @@ -494,7 +495,6 @@ return NewGEP; } - /// Return true if we know that it is safe to sink the load out of the block /// that defines it. This means that it must be obvious the value of the load is /// not changed from the point of the load to the end of the block it is in. @@ -540,7 +540,7 @@ return true; } -Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { +Instruction *InstCombinerImpl::FoldPHIArgLoadIntoPHI(PHINode &PN) { LoadInst *FirstLI = cast(PN.getIncomingValue(0)); // FIXME: This is overconservative; this transform is allowed in some cases @@ -654,7 +654,7 @@ /// TODO: This function could handle other cast types, but then it might /// require special-casing a cast from the 'i1' type. See the comment in /// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types. -Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) { +Instruction *InstCombinerImpl::FoldPHIArgZextsIntoPHI(PHINode &Phi) { // We cannot create a new instruction after the PHI if the terminator is an // EHPad because there is no valid insertion point. if (Instruction *TI = Phi.getParent()->getTerminator()) @@ -728,7 +728,7 @@ /// If all operands to a PHI node are the same "unary" operator and they all are /// only used by the PHI, PHI together their inputs, and do the operation once, /// to the result of the PHI. -Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { +Instruction *InstCombinerImpl::FoldPHIArgOpIntoPHI(PHINode &PN) { // We cannot create a new instruction after the PHI if the terminator is an // EHPad because there is no valid insertion point. if (Instruction *TI = PN.getParent()->getTerminator()) @@ -955,7 +955,7 @@ /// TODO: The user of the trunc may be an bitcast to float/double/vector or an /// inttoptr. We should produce new PHIs in the right type. /// -Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { +Instruction *InstCombinerImpl::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // PHIUsers - Keep track of all of the truncated values extracted from a set // of PHIs, along with their offset. These are the things we want to rewrite. SmallVector PHIUsers; @@ -1203,7 +1203,7 @@ // PHINode simplification // -Instruction *InstCombiner::visitPHINode(PHINode &PN) { +Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) { if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN))) return replaceInstUsesWith(PN, V); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -38,6 +38,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include #include @@ -57,7 +58,7 @@ /// constant of a binop. 
static Instruction *foldSelectBinOpIdentity(SelectInst &Sel, const TargetLibraryInfo &TLI, - InstCombiner &IC) { + InstCombinerImpl &IC) { // The select condition must be an equality compare with a constant operand. Value *X; Constant *C; @@ -279,8 +280,8 @@ } /// We have (select c, TI, FI), and we know that TI and FI have the same opcode. -Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI, - Instruction *FI) { +Instruction *InstCombinerImpl::foldSelectOpOp(SelectInst &SI, Instruction *TI, + Instruction *FI) { // Don't break up min/max patterns. The hasOneUse checks below prevent that // for most cases, but vector min/max with bitcasts can be transformed. If the // one-use restrictions are eased for other patterns, we still don't want to @@ -418,8 +419,8 @@ /// Try to fold the select into one of the operands to allow further /// optimization. -Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, - Value *FalseVal) { +Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal, + Value *FalseVal) { // See the comment above GetSelectFoldableOperands for a description of the // transformation we are doing here. if (auto *TVI = dyn_cast(TrueVal)) { @@ -1024,9 +1025,9 @@ /// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2 /// Note: if C1 != C2, this will change the icmp constant to the existing /// constant operand of the select. -static Instruction * -canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp, - InstCombiner &IC) { +static Instruction *canonicalizeMinMaxWithConstant(SelectInst &Sel, + ICmpInst &Cmp, + InstCombinerImpl &IC) { if (!Cmp.hasOneUse() || !isa(Cmp.getOperand(1))) return nullptr; @@ -1070,7 +1071,7 @@ /// Canonicalize all these variants to 1 pattern. /// This makes CSE more likely. static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp, - InstCombiner &IC) { + InstCombinerImpl &IC) { if (!Cmp.hasOneUse() || !isa(Cmp.getOperand(1))) return nullptr; @@ -1253,7 +1254,7 @@ APInt::getAllOnesValue( C0->getType()->getScalarSizeInBits())))) return nullptr; // Can't do, have all-ones element[s]. - C0 = AddOne(C0); + C0 = InstCombiner::AddOne(C0); std::swap(X, Sel1); break; case ICmpInst::Predicate::ICMP_UGE: @@ -1313,7 +1314,7 @@ APInt::getSignedMaxValue( C2->getType()->getScalarSizeInBits())))) return nullptr; // Can't do, have signed max element[s]. - C2 = AddOne(C2); + C2 = InstCombiner::AddOne(C2); LLVM_FALLTHROUGH; case ICmpInst::Predicate::ICMP_SGE: // Also non-canonical, but here we don't need to change C2, @@ -1360,7 +1361,7 @@ // and swap the hands of select. static Instruction * tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, - InstCombiner &IC) { + InstCombinerImpl &IC) { ICmpInst::Predicate Pred; Value *X; Constant *C0; @@ -1375,7 +1376,7 @@ // If comparison predicate is non-canonical, then we certainly won't be able // to make it canonical; canonicalizeCmpWithConstant() already tried. - if (!isCanonicalPredicate(Pred)) + if (!InstCombiner::isCanonicalPredicate(Pred)) return nullptr; // If the [input] type of comparison and select type are different, lets abort @@ -1403,7 +1404,8 @@ return nullptr; // Check the constant we'd have with flipped-strictness predicate. 
- auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0); + auto FlippedStrictness = + InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, C0); if (!FlippedStrictness) return nullptr; @@ -1426,8 +1428,8 @@ } /// Visit a SelectInst that has an ICmpInst as its first operand. -Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, - ICmpInst *ICI) { +Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, + ICmpInst *ICI) { if (Value *V = foldSelectValueEquivalence(SI, *ICI, SQ)) return replaceInstUsesWith(SI, V); @@ -1579,11 +1581,11 @@ /// We have an SPF (e.g. a min or max) of an SPF of the form: /// SPF2(SPF1(A, B), C) -Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner, - SelectPatternFlavor SPF1, - Value *A, Value *B, - Instruction &Outer, - SelectPatternFlavor SPF2, Value *C) { +Instruction *InstCombinerImpl::foldSPFofSPF(Instruction *Inner, + SelectPatternFlavor SPF1, Value *A, + Value *B, Instruction &Outer, + SelectPatternFlavor SPF2, + Value *C) { if (Outer.getType() != Inner->getType()) return nullptr; @@ -1900,7 +1902,7 @@ return CallInst::Create(F, {X, Y}); } -Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) { +Instruction *InstCombinerImpl::foldSelectExtConst(SelectInst &Sel) { Constant *C; if (!match(Sel.getTrueValue(), m_Constant(C)) && !match(Sel.getFalseValue(), m_Constant(C))) @@ -2001,8 +2003,8 @@ /// to a vector select by splatting the condition. A splat may get folded with /// other operations in IR and having all operands of a select be vector types /// is likely better for vector codegen. -static Instruction *canonicalizeScalarSelectOfVecs( - SelectInst &Sel, InstCombiner &IC) { +static Instruction *canonicalizeScalarSelectOfVecs(SelectInst &Sel, + InstCombinerImpl &IC) { auto *Ty = dyn_cast(Sel.getType()); if (!Ty) return nullptr; @@ -2172,7 +2174,7 @@ } /// Match a sadd_sat or ssub_sat which is using min/max to clamp the value. 
-Instruction *InstCombiner::matchSAddSubSat(SelectInst &MinMax1) { +Instruction *InstCombinerImpl::matchSAddSubSat(SelectInst &MinMax1) { Type *Ty = MinMax1.getType(); // We are looking for a tree of: @@ -2372,7 +2374,8 @@ bool IsTrueIfSignSet; ICmpInst::Predicate Pred; if (!match(Cond, m_OneUse(m_ICmp(Pred, m_BitCast(m_Value(X)), m_APInt(C)))) || - !isSignBitCheck(Pred, *C, IsTrueIfSignSet) || X->getType() != SelType) + !InstCombiner::isSignBitCheck(Pred, *C, IsTrueIfSignSet) || + X->getType() != SelType) return nullptr; // If needed, negate the value that will be the sign argument of the copysign: @@ -2393,7 +2396,7 @@ return CopySign; } -Instruction *InstCombiner::foldVectorSelect(SelectInst &Sel) { +Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) { auto *VecTy = dyn_cast(Sel.getType()); if (!VecTy) return nullptr; @@ -2523,7 +2526,7 @@ return nullptr; } -Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { +Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { Value *CondVal = SI.getCondition(); Value *TrueVal = SI.getTrueValue(); Value *FalseVal = SI.getFalseValue(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" using namespace llvm; using namespace PatternMatch; @@ -31,7 +32,7 @@ // // AnalyzeForSignBitExtraction indicates that we will only analyze whether this // pattern has any 2 right-shifts that sum to 1 less than original bit width. -Value *InstCombiner::reassociateShiftAmtsOfTwoSameDirectionShifts( +Value *InstCombinerImpl::reassociateShiftAmtsOfTwoSameDirectionShifts( BinaryOperator *Sh0, const SimplifyQuery &SQ, bool AnalyzeForSignBitExtraction) { // Look for a shift of some instruction, ignore zext of shift amount if any. @@ -360,7 +361,7 @@ return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2); } -Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { +Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); assert(Op0->getType() == Op1->getType()); @@ -420,8 +421,8 @@ /// Return true if we can simplify two logical (either left or right) shifts /// that have constant shift amounts: OuterShift (InnerShift X, C1), C2. static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl, - Instruction *InnerShift, InstCombiner &IC, - Instruction *CxtI) { + Instruction *InnerShift, + InstCombinerImpl &IC, Instruction *CxtI) { assert(InnerShift->isLogicalShift() && "Unexpected instruction type"); // We need constant scalar or constant splat shifts. @@ -472,7 +473,7 @@ /// where the client will ask if E can be computed shifted right by 64-bits. If /// this succeeds, getShiftedValue() will be called to produce the value. static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift, - InstCombiner &IC, Instruction *CxtI) { + InstCombinerImpl &IC, Instruction *CxtI) { // We can always evaluate constants shifted. if (isa(V)) return true; @@ -608,7 +609,7 @@ /// When canEvaluateShifted() returns true for an expression, this function /// inserts the new computation that produces the shifted value. 
static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, - InstCombiner &IC, const DataLayout &DL) { + InstCombinerImpl &IC, const DataLayout &DL) { // We can always evaluate constants shifted. if (Constant *C = dyn_cast(V)) { if (isLeftShift) @@ -618,7 +619,7 @@ } Instruction *I = cast(V); - IC.Worklist.push(I); + IC.addToWorklist(I); switch (I->getOpcode()) { default: llvm_unreachable("Inconsistency with CanEvaluateShifted"); @@ -672,8 +673,8 @@ } } -Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, - BinaryOperator &I) { +Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1, + BinaryOperator &I) { bool isLeftShift = I.getOpcode() == Instruction::Shl; const APInt *Op1C; @@ -915,7 +916,7 @@ return nullptr; } -Instruction *InstCombiner::visitShl(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) { const SimplifyQuery Q = SQ.getWithInstruction(&I); if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1), @@ -1037,7 +1038,7 @@ return nullptr; } -Instruction *InstCombiner::visitLShr(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); @@ -1167,7 +1168,7 @@ } Instruction * -InstCombiner::foldVariableSignZeroExtensionOfVariableHighBitExtract( +InstCombinerImpl::foldVariableSignZeroExtensionOfVariableHighBitExtract( BinaryOperator &OldAShr) { assert(OldAShr.getOpcode() == Instruction::AShr && "Must be called with arithmetic right-shift instruction only."); @@ -1235,7 +1236,7 @@ return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType()); } -Instruction *InstCombiner::visitAShr(BinaryOperator &I) { +Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) { if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), SQ.getWithInstruction(&I))) return replaceInstUsesWith(I, V); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -12,29 +12,18 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "instcombine" -namespace { - -struct AMDGPUImageDMaskIntrinsic { - unsigned Intr; -}; - -#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL -#include "InstCombineTables.inc" - -} // end anonymous namespace - /// Check to see if the specified operand of the specified instruction is a /// constant integer. If so, check to see if there are any bits set in the /// constant that are not demanded. If so, shrink the constant and return true. @@ -63,7 +52,7 @@ /// Inst is an integer instruction that SimplifyDemandedBits knows about. See if /// the instruction has any properties that allow us to simplify its operands. 
-bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { +bool InstCombinerImpl::SimplifyDemandedInstructionBits(Instruction &Inst) { unsigned BitWidth = Inst.getType()->getScalarSizeInBits(); KnownBits Known(BitWidth); APInt DemandedMask(APInt::getAllOnesValue(BitWidth)); @@ -79,22 +68,20 @@ /// This form of SimplifyDemandedBits simplifies the specified instruction /// operand if possible, updating it in place. It returns true if it made any /// change and false otherwise. -bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo, - const APInt &DemandedMask, - KnownBits &Known, - unsigned Depth) { +bool InstCombinerImpl::SimplifyDemandedBits(Instruction *I, unsigned OpNo, + const APInt &DemandedMask, + KnownBits &Known, unsigned Depth) { Use &U = I->getOperandUse(OpNo); Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known, Depth, I); if (!NewVal) return false; if (Instruction* OpInst = dyn_cast(U)) salvageDebugInfo(*OpInst); - + replaceUse(U, NewVal); return true; } - /// This function attempts to replace V with a simpler value based on the /// demanded bits. When this function is called, it is known that only the bits /// set in DemandedMask of the result of V are ever used downstream. @@ -118,9 +105,10 @@ /// operands based on the information about what bits are demanded. This returns /// some other non-null value if it found out that V is equal to another value /// in the context where the specified bits are demanded, but not for all users. -Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, - KnownBits &Known, unsigned Depth, - Instruction *CxtI) { +Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, + KnownBits &Known, + unsigned Depth, + Instruction *CxtI) { assert(V != nullptr && "Null pointer of Value???"); assert(Depth <= 6 && "Limit Search Depth"); uint32_t BitWidth = DemandedMask.getBitWidth(); @@ -728,7 +716,6 @@ bool KnownBitsComputed = false; if (IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { - default: break; case Intrinsic::bswap: { // If the only bits demanded come from one byte of the bswap result, // just shift the input byte into position to eliminate the bswap. @@ -784,39 +771,14 @@ KnownBitsComputed = true; break; } - case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse_movmsk_ps: - case Intrinsic::x86_sse2_movmsk_pd: - case Intrinsic::x86_sse2_pmovmskb_128: - case Intrinsic::x86_avx_movmsk_ps_256: - case Intrinsic::x86_avx_movmsk_pd_256: - case Intrinsic::x86_avx2_pmovmskb: { - // MOVMSK copies the vector elements' sign bits to the low bits - // and zeros the high bits. - unsigned ArgWidth; - if (II->getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { - ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. - } else { - auto Arg = II->getArgOperand(0); - auto ArgType = cast(Arg->getType()); - ArgWidth = ArgType->getNumElements(); - } - - // If we don't need any of low bits then return zero, - // we know that DemandedMask is non-zero already. - APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); - if (DemandedElts.isNullValue()) - return ConstantInt::getNullValue(VTy); - - // We know that the upper bits are set to zero. 
- Known.Zero.setBitsFrom(ArgWidth); - KnownBitsComputed = true; + default: { + // Handle target specific intrinsics + Optional V = targetSimplifyDemandedUseBitsIntrinsic( + *II, DemandedMask, Known, KnownBitsComputed); + if (V.hasValue()) + return V.getValue(); break; } - case Intrinsic::x86_sse42_crc32_64_64: - Known.Zero.setBitsFrom(32); - KnownBitsComputed = true; - break; } } @@ -836,11 +798,9 @@ /// Helper routine of SimplifyDemandedUseBits. It computes Known /// bits. It also tries to handle simplifications that can be done based on /// DemandedMask, but without modifying the Instruction. -Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I, - const APInt &DemandedMask, - KnownBits &Known, - unsigned Depth, - Instruction *CxtI) { +Value *InstCombinerImpl::SimplifyMultipleUseDemandedBits( + Instruction *I, const APInt &DemandedMask, KnownBits &Known, unsigned Depth, + Instruction *CxtI) { unsigned BitWidth = DemandedMask.getBitWidth(); Type *ITy = I->getType(); @@ -940,7 +900,6 @@ return nullptr; } - /// Helper routine of SimplifyDemandedUseBits. It tries to simplify /// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into /// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign @@ -958,11 +917,9 @@ /// /// As with SimplifyDemandedUseBits, it returns NULL if the simplification was /// not successful. -Value * -InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1, - Instruction *Shl, const APInt &ShlOp1, - const APInt &DemandedMask, - KnownBits &Known) { +Value *InstCombinerImpl::simplifyShrShlDemandedBits( + Instruction *Shr, const APInt &ShrOp1, Instruction *Shl, + const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known) { if (!ShlOp1 || !ShrOp1) return nullptr; // No-op. @@ -1022,153 +979,6 @@ return nullptr; } -/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics. -/// -/// Note: This only supports non-TFE/LWE image intrinsic calls; those have -/// struct returns. -Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, - APInt DemandedElts, - int DMaskIdx) { - - // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported. - if (DMaskIdx < 0 && - II->getType()->getScalarSizeInBits() != 32 && - DemandedElts.getActiveBits() == 3) - return nullptr; - - auto *IIVTy = cast(II->getType()); - unsigned VWidth = IIVTy->getNumElements(); - if (VWidth == 1) - return nullptr; - - IRBuilderBase::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(II); - - // Assume the arguments are unchanged and later override them, if needed. - SmallVector Args(II->arg_begin(), II->arg_end()); - - if (DMaskIdx < 0) { - // Buffer case. - - const unsigned ActiveBits = DemandedElts.getActiveBits(); - const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros(); - - // Start assuming the prefix of elements is demanded, but possibly clear - // some other bits if there are trailing zeros (unused components at front) - // and update offset. - DemandedElts = (1 << ActiveBits) - 1; - - if (UnusedComponentsAtFront > 0) { - static const unsigned InvalidOffsetIdx = 0xf; - - unsigned OffsetIdx; - switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_raw_buffer_load: - OffsetIdx = 1; - break; - case Intrinsic::amdgcn_s_buffer_load: - // If resulting type is vec3, there is no point in trimming the - // load with updated offset, as the vec3 would most likely be widened to - // vec4 anyway during lowering. 
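For illustration, the MOVMSK demanded-bits handling removed above now belongs behind the new TTI hook that the added `default:` case dispatches to. A minimal sketch of such a target-side override follows; the class and header names (X86TTIImpl, X86TargetTransformInfo.h) are assumptions, since the target-side files are not part of this excerpt, and only the logic mirrors the code deleted from SimplifyDemandedUseBits:

// Sketch under assumptions: placement in X86TTIImpl is assumed; the body
// restates the MOVMSK handling removed from the generic code above.
#include "X86TargetTransformInfo.h" // assumed target header declaring X86TTIImpl
#include "llvm/ADT/Optional.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
using namespace llvm;

Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) {
  switch (II.getIntrinsicID()) {
  case Intrinsic::x86_sse2_pmovmskb_128: {
    // MOVMSK copies the vector elements' sign bits to the low bits and zeros
    // the high bits.
    unsigned ArgWidth =
        cast<FixedVectorType>(II.getArgOperand(0)->getType())->getNumElements();
    // If none of the low bits are demanded, the result is known zero.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    if (DemandedElts.isNullValue())
      return ConstantInt::getNullValue(II.getType());
    // Otherwise, record that the bits above the argument width are zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  default:
    break;
  }
  // None means "no target-specific simplification"; the generic caller then
  // falls back to computeKnownBits unless KnownBitsComputed was set above.
  return None;
}

Returning a Value becomes the simplified result exactly as the new default case above consumes it; returning None leaves the generic demanded-bits logic in charge.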
- if (ActiveBits == 4 && UnusedComponentsAtFront == 1) - OffsetIdx = InvalidOffsetIdx; - else - OffsetIdx = 1; - break; - case Intrinsic::amdgcn_struct_buffer_load: - OffsetIdx = 2; - break; - default: - // TODO: handle tbuffer* intrinsics. - OffsetIdx = InvalidOffsetIdx; - break; - } - - if (OffsetIdx != InvalidOffsetIdx) { - // Clear demanded bits and update the offset. - DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1); - auto *Offset = II->getArgOperand(OffsetIdx); - unsigned SingleComponentSizeInBits = - getDataLayout().getTypeSizeInBits(II->getType()->getScalarType()); - unsigned OffsetAdd = - UnusedComponentsAtFront * SingleComponentSizeInBits / 8; - auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd); - Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal); - } - } - } else { - // Image case. - - ConstantInt *DMask = cast(II->getArgOperand(DMaskIdx)); - unsigned DMaskVal = DMask->getZExtValue() & 0xf; - - // Mask off values that are undefined because the dmask doesn't cover them - DemandedElts &= (1 << countPopulation(DMaskVal)) - 1; - - unsigned NewDMaskVal = 0; - unsigned OrigLoadIdx = 0; - for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) { - const unsigned Bit = 1 << SrcIdx; - if (!!(DMaskVal & Bit)) { - if (!!DemandedElts[OrigLoadIdx]) - NewDMaskVal |= Bit; - OrigLoadIdx++; - } - } - - if (DMaskVal != NewDMaskVal) - Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal); - } - - unsigned NewNumElts = DemandedElts.countPopulation(); - if (!NewNumElts) - return UndefValue::get(II->getType()); - - if (NewNumElts >= VWidth && DemandedElts.isMask()) { - if (DMaskIdx >= 0) - II->setArgOperand(DMaskIdx, Args[DMaskIdx]); - return nullptr; - } - - // Validate function argument and return types, extracting overloaded types - // along the way. - SmallVector OverloadTys; - if (!Intrinsic::getIntrinsicSignature(II->getCalledFunction(), OverloadTys)) - return nullptr; - - Module *M = II->getParent()->getParent()->getParent(); - Type *EltTy = IIVTy->getElementType(); - Type *NewTy = - (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts); - - OverloadTys[0] = NewTy; - Function *NewIntrin = - Intrinsic::getDeclaration(M, II->getIntrinsicID(), OverloadTys); - - CallInst *NewCall = Builder.CreateCall(NewIntrin, Args); - NewCall->takeName(II); - NewCall->copyMetadata(*II); - - if (NewNumElts == 1) { - return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall, - DemandedElts.countTrailingZeros()); - } - - SmallVector EltMask; - unsigned NewLoadIdx = 0; - for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) { - if (!!DemandedElts[OrigLoadIdx]) - EltMask.push_back(NewLoadIdx++); - else - EltMask.push_back(NewNumElts); - } - - Value *Shuffle = - Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask); - - return Shuffle; -} - /// The specified value produces a vector with any number of elements. /// This method analyzes which elements of the operand are undef and returns /// that information in UndefElts. @@ -1182,10 +992,11 @@ /// If the information about demanded elements can be used to simplify the /// operation, the operation is simplified, then the resultant value is /// returned. This returns null if no change was made. 
-Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, - APInt &UndefElts, - unsigned Depth, - bool AllowMultipleUsers) { +Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, + APInt DemandedElts, + APInt &UndefElts, + unsigned Depth, + bool AllowMultipleUsers) { // Cannot analyze scalable type. The number of vector elements is not a // compile-time constant. if (isa(V->getType())) @@ -1292,12 +1103,12 @@ }; if (mayIndexStructType(cast(*I))) break; - + // Conservatively track the demanded elements back through any vector // operands we may have. We know there must be at least one, or we // wouldn't have a vector result to get here. Note that we intentionally // merge the undef bits here since gepping with either an undef base or - // index results in undef. + // index results in undef. for (unsigned i = 0; i < I->getNumOperands(); i++) { if (isa(I->getOperand(i))) { // If the entire vector is undefined, just return this info. @@ -1621,227 +1432,19 @@ if (II->getIntrinsicID() == Intrinsic::masked_gather) simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2); simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3); - + // Output elements are undefined if the element from both sources are. // TODO: can strengthen via mask as well. UndefElts = UndefElts2 & UndefElts3; break; } - case Intrinsic::x86_xop_vfrcz_ss: - case Intrinsic::x86_xop_vfrcz_sd: - // The instructions for these intrinsics are speced to zero upper bits not - // pass them through like other scalar intrinsics. So we shouldn't just - // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. - // Instead we should return a zero vector. - if (!DemandedElts[0]) { - Worklist.push(II); - return ConstantAggregateZero::get(II->getType()); - } - - // Only the lower element is used. - DemandedElts = 1; - simplifyAndSetOp(II, 0, DemandedElts, UndefElts); - - // Only the lower element is undefined. The high elements are zero. - UndefElts = UndefElts[0]; - break; - - // Unary scalar-as-vector operations that work column-wise. - case Intrinsic::x86_sse_rcp_ss: - case Intrinsic::x86_sse_rsqrt_ss: - simplifyAndSetOp(II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - Worklist.push(II); - return II->getArgOperand(0); - } - // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions - // checks). - break; - - // Binary scalar-as-vector operations that work column-wise. The high - // elements come from operand 0. The low element is a function of both - // operands. - case Intrinsic::x86_sse_min_ss: - case Intrinsic::x86_sse_max_ss: - case Intrinsic::x86_sse_cmp_ss: - case Intrinsic::x86_sse2_min_sd: - case Intrinsic::x86_sse2_max_sd: - case Intrinsic::x86_sse2_cmp_sd: { - simplifyAndSetOp(II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - Worklist.push(II); - return II->getArgOperand(0); - } - - // Only lower element is used for operand 1. - DemandedElts = 1; - simplifyAndSetOp(II, 1, DemandedElts, UndefElts2); - - // Lower element is undefined if both lower elements are undefined. - // Consider things like undef&0. The result is known zero, not undef. - if (!UndefElts2[0]) - UndefElts.clearBit(0); - - break; - } - - // Binary scalar-as-vector operations that work column-wise. The high - // elements come from operand 0 and the low element comes from operand 1. 
- case Intrinsic::x86_sse41_round_ss: - case Intrinsic::x86_sse41_round_sd: { - // Don't use the low element of operand 0. - APInt DemandedElts2 = DemandedElts; - DemandedElts2.clearBit(0); - simplifyAndSetOp(II, 0, DemandedElts2, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - Worklist.push(II); - return II->getArgOperand(0); - } - - // Only lower element is used for operand 1. - DemandedElts = 1; - simplifyAndSetOp(II, 1, DemandedElts, UndefElts2); - - // Take the high undef elements from operand 0 and take the lower element - // from operand 1. - UndefElts.clearBit(0); - UndefElts |= UndefElts2[0]; - break; - } - - // Three input scalar-as-vector operations that work column-wise. The high - // elements come from operand 0 and the low element is a function of all - // three inputs. - case Intrinsic::x86_avx512_mask_add_ss_round: - case Intrinsic::x86_avx512_mask_div_ss_round: - case Intrinsic::x86_avx512_mask_mul_ss_round: - case Intrinsic::x86_avx512_mask_sub_ss_round: - case Intrinsic::x86_avx512_mask_max_ss_round: - case Intrinsic::x86_avx512_mask_min_ss_round: - case Intrinsic::x86_avx512_mask_add_sd_round: - case Intrinsic::x86_avx512_mask_div_sd_round: - case Intrinsic::x86_avx512_mask_mul_sd_round: - case Intrinsic::x86_avx512_mask_sub_sd_round: - case Intrinsic::x86_avx512_mask_max_sd_round: - case Intrinsic::x86_avx512_mask_min_sd_round: - simplifyAndSetOp(II, 0, DemandedElts, UndefElts); - - // If lowest element of a scalar op isn't used then use Arg0. - if (!DemandedElts[0]) { - Worklist.push(II); - return II->getArgOperand(0); - } - - // Only lower element is used for operand 1 and 2. - DemandedElts = 1; - simplifyAndSetOp(II, 1, DemandedElts, UndefElts2); - simplifyAndSetOp(II, 2, DemandedElts, UndefElts3); - - // Lower element is undefined if all three lower elements are undefined. - // Consider things like undef&0. The result is known zero, not undef. - if (!UndefElts2[0] || !UndefElts3[0]) - UndefElts.clearBit(0); - - break; - - case Intrinsic::x86_sse2_packssdw_128: - case Intrinsic::x86_sse2_packsswb_128: - case Intrinsic::x86_sse2_packuswb_128: - case Intrinsic::x86_sse41_packusdw: - case Intrinsic::x86_avx2_packssdw: - case Intrinsic::x86_avx2_packsswb: - case Intrinsic::x86_avx2_packusdw: - case Intrinsic::x86_avx2_packuswb: - case Intrinsic::x86_avx512_packssdw_512: - case Intrinsic::x86_avx512_packsswb_512: - case Intrinsic::x86_avx512_packusdw_512: - case Intrinsic::x86_avx512_packuswb_512: { - auto *Ty0 = II->getArgOperand(0)->getType(); - unsigned InnerVWidth = cast(Ty0)->getNumElements(); - assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); - - unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; - unsigned VWidthPerLane = VWidth / NumLanes; - unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; - - // Per lane, pack the elements of the first input and then the second. - // e.g. - // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) - // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) - for (int OpNum = 0; OpNum != 2; ++OpNum) { - APInt OpDemandedElts(InnerVWidth, 0); - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - unsigned LaneIdx = Lane * VWidthPerLane; - for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { - unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; - if (DemandedElts[Idx]) - OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); - } - } - - // Demand elements from the operand. 
- APInt OpUndefElts(InnerVWidth, 0); - simplifyAndSetOp(II, OpNum, OpDemandedElts, OpUndefElts); - - // Pack the operand's UNDEF elements, one lane at a time. - OpUndefElts = OpUndefElts.zext(VWidth); - for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); - LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); - LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); - UndefElts |= LaneElts; - } - } - break; - } - - // PSHUFB - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - case Intrinsic::x86_avx512_pshuf_b_512: - // PERMILVAR - case Intrinsic::x86_avx_vpermilvar_ps: - case Intrinsic::x86_avx_vpermilvar_ps_256: - case Intrinsic::x86_avx512_vpermilvar_ps_512: - case Intrinsic::x86_avx_vpermilvar_pd: - case Intrinsic::x86_avx_vpermilvar_pd_256: - case Intrinsic::x86_avx512_vpermilvar_pd_512: - // PERMV - case Intrinsic::x86_avx2_permd: - case Intrinsic::x86_avx2_permps: { - simplifyAndSetOp(II, 1, DemandedElts, UndefElts); - break; - } - - // SSE4A instructions leave the upper 64-bits of the 128-bit result - // in an undefined state. - case Intrinsic::x86_sse4a_extrq: - case Intrinsic::x86_sse4a_extrqi: - case Intrinsic::x86_sse4a_insertq: - case Intrinsic::x86_sse4a_insertqi: - UndefElts.setHighBits(VWidth / 2); - break; - case Intrinsic::amdgcn_buffer_load: - case Intrinsic::amdgcn_buffer_load_format: - case Intrinsic::amdgcn_raw_buffer_load: - case Intrinsic::amdgcn_raw_buffer_load_format: - case Intrinsic::amdgcn_raw_tbuffer_load: - case Intrinsic::amdgcn_s_buffer_load: - case Intrinsic::amdgcn_struct_buffer_load: - case Intrinsic::amdgcn_struct_buffer_load_format: - case Intrinsic::amdgcn_struct_tbuffer_load: - case Intrinsic::amdgcn_tbuffer_load: - return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts); default: { - if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID())) - return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0); - + // Handle target specific intrinsics + Optional V = targetSimplifyDemandedVectorEltsIntrinsic( + *II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + simplifyAndSetOp); + if (V.hasValue()) + return V.getValue(); break; } } // switch on IntrinsicID diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -35,6 +35,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" #include #include #include @@ -85,7 +86,8 @@ // If we have a PHI node with a vector type that is only used to feed // itself and be an operand of extractelement at a constant location, // try to replace the PHI of the vector type with a PHI of a scalar type. 
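The same pattern applies to the demanded-vector-elements path: the PSHUFB/VPERMILVAR/PERMV handling deleted above is re-expressed behind the new hook that the added `default:` case calls, using the SimplifyAndSetOp callback that InstCombine passes in. A sketch under the same assumptions (X86TTIImpl placement and header name assumed; the per-intrinsic logic mirrors the removed code):

// Sketch under assumptions: only the control-vector operand (operand 1) of
// these variable shuffles is constrained by the demanded result elements.
#include "X86TargetTransformInfo.h" // assumed target header
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Optional.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <functional>
using namespace llvm;

Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) {
  switch (II.getIntrinsicID()) {
  // PSHUFB / PERMILVAR / PERMV: forward the demanded result elements to the
  // shuffle-control operand via the callback provided by InstCombine.
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx2_permd:
    SimplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  default:
    break;
  }
  // None keeps the generic demanded-elements analysis in charge of the result.
  return None;
}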
-Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { +Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI, + PHINode *PN) { SmallVector Extracts; // The users we want the PHI to have are: // 1) The EI ExtractElement (we already know this) @@ -321,7 +323,7 @@ return UnionUsedElts; } -Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { +Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { Value *SrcVec = EI.getVectorOperand(); Value *Index = EI.getIndexOperand(); if (Value *V = SimplifyExtractElementInst(SrcVec, Index, @@ -531,7 +533,7 @@ /// shufflevector to replace one or more insert/extract pairs. static void replaceExtractElements(InsertElementInst *InsElt, ExtractElementInst *ExtElt, - InstCombiner &IC) { + InstCombinerImpl &IC) { VectorType *InsVecType = InsElt->getType(); VectorType *ExtVecType = ExtElt->getVectorOperandType(); unsigned NumInsElts = InsVecType->getNumElements(); @@ -614,7 +616,7 @@ static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl &Mask, Value *PermittedRHS, - InstCombiner &IC) { + InstCombinerImpl &IC) { assert(V->getType()->isVectorTy() && "Invalid shuffle!"); unsigned NumElts = cast(V->getType())->getNumElements(); @@ -699,7 +701,7 @@ /// first one, making the first one redundant. /// It should be transformed to: /// %0 = insertvalue { i8, i32 } undef, i8 %y, 0 -Instruction *InstCombiner::visitInsertValueInst(InsertValueInst &I) { +Instruction *InstCombinerImpl::visitInsertValueInst(InsertValueInst &I) { bool IsRedundant = false; ArrayRef FirstIndices = I.getIndices(); @@ -1041,7 +1043,7 @@ return nullptr; } -Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { +Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) { Value *VecOp = IE.getOperand(0); Value *ScalarOp = IE.getOperand(1); Value *IdxOp = IE.getOperand(2); @@ -1525,7 +1527,7 @@ is_contained(Mask, UndefMaskElem) && (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode)); if (MightCreatePoisonOrUB) - NewC = getSafeVectorConstantForBinop(BOpcode, NewC, true); + NewC = InstCombiner::getSafeVectorConstantForBinop(BOpcode, NewC, true); // shuf (bop X, C), X, M --> bop X, C' // shuf X, (bop X, C), M --> bop X, C' @@ -1652,7 +1654,8 @@ is_contained(Mask, UndefMaskElem) && (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc)); if (MightCreatePoisonOrUB) - NewC = getSafeVectorConstantForBinop(BOpc, NewC, ConstantsAreOp1); + NewC = InstCombiner::getSafeVectorConstantForBinop(BOpc, NewC, + ConstantsAreOp1); Value *V; if (X == Y) { @@ -1823,7 +1826,7 @@ /// Try to replace a shuffle with an insertelement or try to replace a shuffle /// operand with the operand of an insertelement. 
static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf, - InstCombiner &IC) { + InstCombinerImpl &IC) { Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1); SmallVector Mask; Shuf.getShuffleMask(Mask); @@ -1974,7 +1977,7 @@ return new ShuffleVectorInst(X, Y, NewMask); } -Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { +Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -59,6 +59,7 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/BasicBlock.h" @@ -160,7 +161,41 @@ static cl::opt ShouldLowerDbgDeclare("instcombine-lower-dbg-declare", cl::Hidden, cl::init(true)); -Value *InstCombiner::EmitGEPOffset(User *GEP) { +Optional +InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) { + // Handle target specific intrinsics + if (II.getCalledFunction()->isTargetIntrinsic()) { + return TTI.instCombineIntrinsic(*this, II); + } + return None; +} + +Optional InstCombiner::targetSimplifyDemandedUseBitsIntrinsic( + IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, + bool &KnownBitsComputed) { + // Handle target specific intrinsics + if (II.getCalledFunction()->isTargetIntrinsic()) { + return TTI.simplifyDemandedUseBitsIntrinsic(*this, II, DemandedMask, Known, + KnownBitsComputed); + } + return None; +} + +Optional InstCombiner::targetSimplifyDemandedVectorEltsIntrinsic( + IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, + APInt &UndefElts3, + std::function + SimplifyAndSetOp) { + // Handle target specific intrinsics + if (II.getCalledFunction()->isTargetIntrinsic()) { + return TTI.simplifyDemandedVectorEltsIntrinsic( + *this, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, + SimplifyAndSetOp); + } + return None; +} + +Value *InstCombinerImpl::EmitGEPOffset(User *GEP) { return llvm::EmitGEPOffset(&Builder, DL, GEP); } @@ -173,8 +208,8 @@ /// legal to convert to, in order to open up more combining opportunities. /// NOTE: this treats i8, i16 and i32 specially, due to them being so common /// from frontend languages. -bool InstCombiner::shouldChangeType(unsigned FromWidth, - unsigned ToWidth) const { +bool InstCombinerImpl::shouldChangeType(unsigned FromWidth, + unsigned ToWidth) const { bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth); bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth); @@ -201,7 +236,7 @@ /// to a larger illegal type. i1 is always treated as a legal type because it is /// a fundamental type in IR, and there are many specialized optimizations for /// i1 types. -bool InstCombiner::shouldChangeType(Type *From, Type *To) const { +bool InstCombinerImpl::shouldChangeType(Type *From, Type *To) const { // TODO: This could be extended to allow vectors. Datalayout changes might be // needed to properly support that. 
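The wrappers added above only consult TTI when the callee is a target intrinsic. For illustration, a minimal sketch of what a backend override of the instCombineIntrinsic hook can look like; the target name and intrinsic are hypothetical, and replaceInstUsesWith is assumed to be exposed on the public InstCombiner base, since target combines need it:

// Sketch under assumptions: MyTargetTTIImpl and Intrinsic::mytarget_passthru
// are placeholders; only the hook's shape and return conventions come from
// the interface added by this patch.
#include "llvm/ADT/Optional.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
using namespace llvm;

Optional<Instruction *>
MyTargetTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) {
  switch (II.getIntrinsicID()) {
  case Intrinsic::mytarget_passthru: // hypothetical identity-like intrinsic
    // Replace all uses of the call with its operand; the returned
    // Instruction* tells InstCombine the call has been combined.
    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  default:
    break;
  }
  // None: nothing target specific to do; generic InstCombine continues.
  return None;
}

Because these combines are now reached through TTI, the InstCombine regression tests below add -mtriple to their RUN lines: without a target triple, opt builds a default TTI that offers no target-specific folds.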
if (!From->isIntegerTy() || !To->isIntegerTy()) @@ -269,7 +304,8 @@ /// cast to eliminate one of the associative operations: /// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2))) /// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2)) -static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1, InstCombiner &IC) { +static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1, + InstCombinerImpl &IC) { auto *Cast = dyn_cast(BinOp1->getOperand(0)); if (!Cast || !Cast->hasOneUse()) return false; @@ -327,7 +363,7 @@ /// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. /// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" /// if C1 and C2 are constants. -bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { +bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Instruction::BinaryOps Opcode = I.getOpcode(); bool Changed = false; @@ -555,9 +591,10 @@ /// This tries to simplify binary operations by factorizing out common terms /// (e. g. "(A*B)+(A*C)" -> "A*(B+C)"). -Value *InstCombiner::tryFactorization(BinaryOperator &I, - Instruction::BinaryOps InnerOpcode, - Value *A, Value *B, Value *C, Value *D) { +Value *InstCombinerImpl::tryFactorization(BinaryOperator &I, + Instruction::BinaryOps InnerOpcode, + Value *A, Value *B, Value *C, + Value *D) { assert(A && B && C && D && "All values must be provided"); Value *V = nullptr; @@ -660,7 +697,7 @@ /// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in /// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win). /// Returns the simplified value, or null if it didn't simplify. -Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { +Value *InstCombinerImpl::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); BinaryOperator *Op0 = dyn_cast(LHS); BinaryOperator *Op1 = dyn_cast(RHS); @@ -774,8 +811,9 @@ return SimplifySelectsFeedingBinaryOp(I, LHS, RHS); } -Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, - Value *LHS, Value *RHS) { +Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, + Value *LHS, + Value *RHS) { Value *A, *B, *C, *D, *E, *F; bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C))); bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F))); @@ -827,7 +865,7 @@ /// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a /// constant zero (which is the 'negate' form). -Value *InstCombiner::dyn_castNegVal(Value *V) const { +Value *InstCombinerImpl::dyn_castNegVal(Value *V) const { Value *NegV; if (match(V, m_Neg(m_Value(NegV)))) return NegV; @@ -888,7 +926,8 @@ return RI; } -Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { +Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, + SelectInst *SI) { // Don't modify shared select instructions. 
if (!SI->hasOneUse()) return nullptr; @@ -983,7 +1022,7 @@ return RI; } -Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) { +Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) { unsigned NumPHIValues = PN->getNumIncomingValues(); if (NumPHIValues == 0) return nullptr; @@ -1125,7 +1164,7 @@ return replaceInstUsesWith(I, NewPN); } -Instruction *InstCombiner::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { +Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { if (!isa(I.getOperand(1))) return nullptr; @@ -1143,8 +1182,9 @@ /// is a sequence of GEP indices into the pointed type that will land us at the /// specified offset. If so, fill them into NewIndices and return the resultant /// element type, otherwise return null. -Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset, - SmallVectorImpl &NewIndices) { +Type * +InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t Offset, + SmallVectorImpl &NewIndices) { Type *Ty = PtrTy->getElementType(); if (!Ty->isSized()) return nullptr; @@ -1213,7 +1253,7 @@ /// Return a value X such that Val = X * Scale, or null if none. /// If the multiplication is known not to overflow, then NoSignedWrap is set. -Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { +Value *InstCombinerImpl::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { assert(isa(Val->getType()) && "Can only descale integers!"); assert(cast(Val->getType())->getBitWidth() == Scale.getBitWidth() && "Scale not compatible with value!"); @@ -1453,7 +1493,7 @@ } while (true); } -Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) { +Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { // FIXME: some of this is likely fine for scalable vectors if (!isa(Inst.getType())) return nullptr; @@ -1675,7 +1715,7 @@ /// Try to narrow the width of a binop if at least 1 operand is an extend of /// of a value. This requires a potentially expensive known bits check to make /// sure the narrow op does not overflow. -Instruction *InstCombiner::narrowMathIfNoOverflow(BinaryOperator &BO) { +Instruction *InstCombinerImpl::narrowMathIfNoOverflow(BinaryOperator &BO) { // We need at least one extended operand. Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1); @@ -1764,7 +1804,7 @@ return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel); } -Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { +Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector Ops(GEP.op_begin(), GEP.op_end()); Type *GEPType = GEP.getType(); Type *GEPEltType = GEP.getSourceElementType(); @@ -2516,7 +2556,7 @@ return true; } -Instruction *InstCombiner::visitAllocSite(Instruction &MI) { +Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { // If we have a malloc call which is only used in any amount of comparisons to // null and free calls, delete the calls and replace the comparisons with true // or false as appropriate. @@ -2675,7 +2715,7 @@ return &FI; } -Instruction *InstCombiner::visitFree(CallInst &FI) { +Instruction *InstCombinerImpl::visitFree(CallInst &FI) { Value *Op = FI.getArgOperand(0); // free undef -> unreachable. 
@@ -2716,7 +2756,7 @@ return false; } -Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { +Instruction *InstCombinerImpl::visitReturnInst(ReturnInst &RI) { if (RI.getNumOperands() == 0) // ret void return nullptr; @@ -2739,7 +2779,7 @@ return nullptr; } -Instruction *InstCombiner::visitUnconditionalBranchInst(BranchInst &BI) { +Instruction *InstCombinerImpl::visitUnconditionalBranchInst(BranchInst &BI) { assert(BI.isUnconditional() && "Only for unconditional branches."); // If this store is the second-to-last instruction in the basic block @@ -2768,7 +2808,7 @@ return nullptr; } -Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { +Instruction *InstCombinerImpl::visitBranchInst(BranchInst &BI) { if (BI.isUnconditional()) return visitUnconditionalBranchInst(BI); @@ -2804,7 +2844,7 @@ return nullptr; } -Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { +Instruction *InstCombinerImpl::visitSwitchInst(SwitchInst &SI) { Value *Cond = SI.getCondition(); Value *Op0; ConstantInt *AddRHS; @@ -2835,7 +2875,7 @@ unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes); // Shrink the condition operand if the new type is smaller than the old type. - // But do not shrink to a non-standard type, because backend can't generate + // But do not shrink to a non-standard type, because backend can't generate // good code for that yet. // TODO: We can make it aggressive again after fixing PR39569. if (NewWidth > 0 && NewWidth < Known.getBitWidth() && @@ -2854,7 +2894,7 @@ return nullptr; } -Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { +Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) { Value *Agg = EV.getAggregateOperand(); if (!EV.hasIndices()) @@ -3015,7 +3055,7 @@ cast(RHS->getType())->getNumElements(); } -Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { +Instruction *InstCombinerImpl::visitLandingPadInst(LandingPadInst &LI) { // The logic here should be correct for any real-world personality function. // However if that turns out not to be true, the offending logic can always // be conditioned on the personality function, like the catch-all logic is. @@ -3324,7 +3364,7 @@ return nullptr; } -Instruction *InstCombiner::visitFreeze(FreezeInst &I) { +Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { Value *Op0 = I.getOperand(0); if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I))) @@ -3435,7 +3475,7 @@ return true; } -bool InstCombiner::run() { +bool InstCombinerImpl::run() { while (!Worklist.isEmpty()) { // Walk deferred instructions in reverse order, and push them to the // worklist, which means they'll end up popped from the worklist in-order. 
@@ -3718,8 +3758,8 @@ static bool combineInstructionsOverFunction( Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, - AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, - OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, + AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI, + DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) { auto &DL = F.getParent()->getDataLayout(); MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue()); @@ -3763,8 +3803,8 @@ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); - InstCombiner IC(Worklist, Builder, F.hasMinSize(), AA, - AC, TLI, DT, ORE, BFI, PSI, DL, LI); + InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT, + ORE, BFI, PSI, DL, LI); IC.MaxArraySizeForCombine = MaxArraySize; if (!IC.run()) @@ -3787,6 +3827,7 @@ auto &DT = AM.getResult(F); auto &TLI = AM.getResult(F); auto &ORE = AM.getResult(F); + auto &TTI = AM.getResult(F); auto *LI = AM.getCachedResult(F); @@ -3797,8 +3838,8 @@ auto *BFI = (PSI && PSI->hasProfileSummary()) ? &AM.getResult(F) : nullptr; - if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, - PSI, MaxIterations, LI)) + if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE, + BFI, PSI, MaxIterations, LI)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -3816,6 +3857,7 @@ AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -3834,6 +3876,7 @@ auto AA = &getAnalysis().getAAResults(); auto &AC = getAnalysis().getAssumptionCache(F); auto &TLI = getAnalysis().getTLI(F); + auto &TTI = getAnalysis().getTTI(F); auto &DT = getAnalysis().getDomTree(); auto &ORE = getAnalysis().getORE(); @@ -3847,8 +3890,8 @@ &getAnalysis().getBFI() : nullptr; - return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, - PSI, MaxIterations, LI); + return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE, + BFI, PSI, MaxIterations, LI); } char InstructionCombiningPass::ID = 0; @@ -3867,6 +3910,7 @@ "Combine redundant instructions", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - | FileCheck %s +; RUN: opt -instcombine -mtriple=thumbv8.1m.main %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - | FileCheck %s declare <16 x i1> @llvm.arm.mve.vctp8(i32) declare <8 x i1> @llvm.arm.mve.vctp16(i32) diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: opt -instcombine -S %s | FileCheck --check-prefix=IR %s -; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -O3 -o - | FileCheck --check-prefix=ASM %s +; RUN: opt -instcombine -mtriple=thumbv8.1m.main -S %s | FileCheck --check-prefix=IR %s +; RUN: opt -instcombine -mtriple=thumbv8.1m.main %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -O3 -o - | FileCheck --check-prefix=ASM %s %struct.foo = type { [2 x <4 x i32>] } diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll b/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll --- a/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs -o - | FileCheck %s +; RUN: opt -instcombine -mtriple=thumbv8.1m.main-none-eabi %s | llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs -o - | FileCheck %s target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" @@ -42,4 +42,3 @@ declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) declare <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) declare <8 x i1> @llvm.arm.mve.vctp16(i32) - diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -instcombine %s | FileCheck %s +; RUN: opt -S -instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s ; -------------------------------------------------------------------- ; llvm.amdgcn.buffer.load diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -instcombine -S < %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -instcombine -S < %s | FileCheck %s ; -------------------------------------------------------------------- ; llvm.amdgcn.rcp diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/ldexp.ll b/llvm/test/Transforms/InstCombine/AMDGPU/ldexp.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/ldexp.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/ldexp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -instcombine -S | FileCheck %s define float @ldexp_f32_undef_undef() { ; CHECK-LABEL: @ldexp_f32_undef_undef( diff --git a/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll b/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll --- a/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll +++ b/llvm/test/Transforms/InstCombine/ARM/mve-v2i2v.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -instcombine -S -o - %s | FileCheck %s +; RUN: 
opt -instcombine -S -mtriple=arm -o - %s | FileCheck %s target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll --- a/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/ARM/neon-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s +; RUN: opt < %s -instcombine -mtriple=arm -S | FileCheck %s ; The alignment arguments for NEON load/store intrinsics can be increased ; by instcombine. Check for this. diff --git a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll --- a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll +++ b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll @@ -6,11 +6,11 @@ ; RUN: cat %s > %t.ftz ; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "preserve-sign" }' >> %t.ftz -; RUN: opt < %t.ftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ +; RUN: opt < %t.ftz -instcombine -mtriple=nvptx64-nvidia-cuda -S | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ ; RUN: cat %s > %t.noftz ; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "ieee" }' >> %t.noftz -; RUN: opt < %t.noftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ +; RUN: opt < %t.noftz -instcombine -mtriple=nvptx64-nvidia-cuda -S | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ ; We handle nvvm intrinsics with ftz variants as follows: ; - If the module is in ftz mode, the ftz variant is transformed into the diff --git a/llvm/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll b/llvm/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll --- a/llvm/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll +++ b/llvm/test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -instcombine -S | FileCheck %s +; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s ; The test checks the folding of cmp(sub(a,b),0) into cmp(a,b). 
diff --git a/llvm/test/Transforms/InstCombine/X86/addcarry.ll b/llvm/test/Transforms/InstCombine/X86/addcarry.ll
--- a/llvm/test/Transforms/InstCombine/X86/addcarry.ll
+++ b/llvm/test/Transforms/InstCombine/X86/addcarry.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
 declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64)
@@ -35,4 +35,3 @@
   %r = extractvalue { i8, i64 } %s, 1
   ret i64 %r
 }
-
diff --git a/llvm/test/Transforms/InstCombine/X86/clmulqdq.ll b/llvm/test/Transforms/InstCombine/X86/clmulqdq.ll
--- a/llvm/test/Transforms/InstCombine/X86/clmulqdq.ll
+++ b/llvm/test/Transforms/InstCombine/X86/clmulqdq.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8)
 declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8)
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx2.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx2.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-avx2.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Verify that instcombine is able to fold identity shuffles.
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll b/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 declare i32 @llvm.x86.tbm.bextri.u32(i32, i32) nounwind readnone
 declare i64 @llvm.x86.tbm.bextri.u64(i64, i64) nounwind readnone
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-insertps.ll b/llvm/test/Transforms/InstCombine/X86/x86-insertps.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-insertps.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-insertps.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll b/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 ;; MASKED LOADS
@@ -325,4 +325,3 @@
 declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>)
 
 declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*)
-
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-pack.ll b/llvm/test/Transforms/InstCombine/X86/x86-pack.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-pack.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-pack.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 ;
 ; UNDEF Elts
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-pshufb.ll b/llvm/test/Transforms/InstCombine/X86/x86-pshufb.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-pshufb.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-pshufb.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 ; Verify that instcombine is able to fold identity shuffles.
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-sse.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define float @test_rcp_ss_0(float %a) {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define double @test_sqrt_sd_0(double %a) {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse41.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define <2 x double> @test_round_sd(<2 x double> %a, <2 x double> %b) {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse4a.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse4a.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-sse4a.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-sse4a.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 ;
 ; EXTRQ
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define i16 @test1(float %f) {
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ;
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vpermil.ll b/llvm/test/Transforms/InstCombine/X86/x86-vpermil.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-vpermil.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vpermil.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Verify that instcombine is able to fold identity shuffles.
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-xop.ll b/llvm/test/Transforms/InstCombine/X86/x86-xop.ll
--- a/llvm/test/Transforms/InstCombine/X86/x86-xop.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-xop.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
 define <2 x double> @test_vfrcz_sd(<2 x double> %a) {
 ; CHECK-LABEL: @test_vfrcz_sd(