Index: llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
===================================================================
--- llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -44,9 +44,10 @@
 /// combine them.
 class LLVM_LIBRARY_VISIBILITY InstCombiner {
   /// Only used to call target specific inst combining.
-  TargetTransformInfo &TTI;
 
 public:
+  TargetTransformInfo &TTI;
+
   /// Maximum size of array considered when transforming.
   uint64_t MaxArraySizeForCombine = 0;
 
Index: llvm/lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -188,6 +188,8 @@
   LoadInst *combineLoadToNewType(LoadInst &LI, Type *NewTy,
                                  const Twine &Suffix = "");
 
+  bool isFreeExt(Instruction *ExtI);
+  bool canFoldLoadIntoPHI(PHINode &PN, ArrayRef<Value *> IncLIs, Align &LoadAlignment);
 private:
   void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI);
 
Index: llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -20,6 +20,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/InstructionCost.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -655,13 +657,16 @@
   return true;
 }
 
-Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
-  LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0));
-
+// Returns true if foldPHIArgLoadIntoPHI can fold the IncLIs loads as arguments
+// into the PHI node PN. On success it also writes the alignment required for
+// the new load into LoadAlignment.
+bool InstCombinerImpl::canFoldLoadIntoPHI(PHINode &PN, ArrayRef<Value *> IncLIs,
+                                          Align &LoadAlignment) {
+  auto FirstLI = dyn_cast<LoadInst>(IncLIs[0]);
   // FIXME: This is overconservative; this transform is allowed in some cases
   // for atomic operations.
   if (FirstLI->isAtomic())
-    return nullptr;
+    return false;
 
   // When processing loads, we need to propagate two bits of information to the
   // sunk load: whether it is volatile, and what its alignment is.  We currently
@@ -669,27 +674,27 @@
   // visitLoadInst will propagate an alignment onto the load when TD is around,
   // and if TD isn't around, we can't handle the mixed case.
   bool isVolatile = FirstLI->isVolatile();
-  Align LoadAlignment = FirstLI->getAlign();
+  LoadAlignment = FirstLI->getAlign();
   unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
 
   // We can't sink the load if the loaded value could be modified between the
   // load and the PHI.
   if (FirstLI->getParent() != PN.getIncomingBlock(0) ||
       !isSafeAndProfitableToSinkLoad(FirstLI))
-    return nullptr;
+    return false;
 
   // If the PHI is of volatile loads and the load block has multiple
   // successors, sinking it would remove a load of the volatile value from
   // the path through the other successor.
   if (isVolatile &&
       FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1)
-    return nullptr;
+    return false;
 
   // Check to see if all arguments are the same operation.
   for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
-    LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i));
+    auto LI = dyn_cast<LoadInst>(IncLIs[i]);
     if (!LI || !LI->hasOneUser())
-      return nullptr;
+      return false;
 
     // We can't sink the load if the loaded value could be modified between
     // the load and the PHI.
@@ -697,24 +702,36 @@
         LI->getParent() != PN.getIncomingBlock(i) ||
         LI->getPointerAddressSpace() != LoadAddrSpace ||
         !isSafeAndProfitableToSinkLoad(LI))
-      return nullptr;
+      return false;
 
     LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign()));
 
     // If the PHI is of volatile loads and the load block has multiple
    // successors, sinking it would remove a load of the volatile value from
     // the path through the other successor.
-    if (isVolatile &&
-        LI->getParent()->getTerminator()->getNumSuccessors() != 1)
-      return nullptr;
+    if (isVolatile && LI->getParent()->getTerminator()->getNumSuccessors() != 1)
+      return false;
   }
+  return true;
+}
+
+Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
+  Align LoadAlignment;
+  std::vector<Value *> IncLIs;
+  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+    IncLIs.push_back(PN.getIncomingValue(i));
+  if (!canFoldLoadIntoPHI(PN, ArrayRef<Value *>(IncLIs), LoadAlignment))
+    return nullptr;
+
+  auto FirstLI = cast<LoadInst>(PN.getIncomingValue(0));
 
   // Okay, they are all the same operation.  Create a new PHI node of the
   // correct type, and PHI together all of the LHS's of the instructions.
   PHINode *NewPN =
     PHINode::Create(FirstLI->getOperand(0)->getType(), PN.getNumIncomingValues(),
                     PN.getName()+".in");
 
+  bool isVolatile = FirstLI->isVolatile();
   Value *InVal = FirstLI->getOperand(0);
   NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
   LoadInst *NewLI =
@@ -840,6 +857,39 @@
   return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType());
 }
 
+// Return true if ExtI is a free extension, i.e. check whether the target can
+// fold the SExt or ZExt into the operation that produces its operand;
+// currently this only checks for loads. Returns false otherwise.
+bool InstCombinerImpl::isFreeExt(Instruction *ExtI) {
+  // Check whether ExtI is a sign- or zero-extension.
+  if (ExtI->getOpcode() != Instruction::SExt &&
+      ExtI->getOpcode() != Instruction::ZExt)
+    return false;
+
+  // If the operand isn't an Instruction it can't be a load.
+  auto *LoadI = dyn_cast<Instruction>(ExtI->getOperand(0));
+  if (!LoadI)
+    return false;
+
+  // Check whether we are dealing with a normal or a masked load.
+  bool masked = false;
+  if (isa<IntrinsicInst>(LoadI)) {
+    if (cast<IntrinsicInst>(LoadI)->getIntrinsicID() != Intrinsic::masked_load)
+      return false;
+    masked = true;
+  } else if (LoadI->getOpcode() != Instruction::Load)
+    return false;
+
+  // Check whether the target can merge the load and extend instructions.
+  InstructionCost Cost = TTI.getCastInstrCost(
+      ExtI->getOpcode(), ExtI->getType(), LoadI->getType(),
+      masked ? TTI::CastContextHint::Masked : TTI::CastContextHint::Normal,
+      TTI::TCK_SizeAndLatency, ExtI);
+
+  // Check whether the cost of the Ext is valid and indeed 0, i.e. free.
+  return Cost.isValid() && Cost == 0;
+}
+
 /// If all operands to a PHI node are the same "unary" operator and they all are
 /// only used by the PHI, PHI together their inputs, and do the operation once,
 /// to the result of the PHI.
@@ -887,19 +937,47 @@
     return nullptr; // Cannot fold this operation.
   }
 
+  // We want to keep (S|Z)Exts and loads together when the Ext is free.
+  // OneIsFree records whether any of the Exts feeding into the PHI node
+  // are free; AllAreFree records whether all of them are free.
+  bool OneIsFree = isFreeExt(FirstInst);
+  bool AllAreFree = OneIsFree;
+  std::vector<Value *> IncLIs;
+
+  if (OneIsFree)
+    IncLIs.push_back(FirstInst->getOperand(0));
+
   // Check to see if all arguments are the same operation.
   for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
     Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
     if (!I || !I->hasOneUser() || !I->isSameOperationAs(FirstInst))
       return nullptr;
+
+    if (isFreeExt(I)) {
+      OneIsFree = true;
+      IncLIs.push_back(I->getOperand(0));
+    } else
+      AllAreFree = false;
+
     if (CastSrcTy) {
       if (I->getOperand(0)->getType() != CastSrcTy)
-        return nullptr;  // Cast operation must match.
+        return nullptr; // Cast operation must match.
     } else if (I->getOperand(1) != ConstantOp) {
       return nullptr;
     }
   }
 
+  // If all the Exts are free but the loads cannot be sunk into the PHI, do not
+  // sink the Exts either; if all the Exts are free and the loads can be sunk
+  // as well, we do want to sink both the loads and the Exts.
+  // If only some of the Exts are free and we are not optimizing for size, do
+  // not sink the Exts, since that would penalize the performance of the
+  // path(s) through the free Ext(s).
+  Align dummy;
+  if ((AllAreFree && !canFoldLoadIntoPHI(PN, IncLIs, dummy)) ||
+      (!AllAreFree && OneIsFree && !MinimizeSize))
+    return nullptr;
+
   // Okay, they are all the same operation.  Create a new PHI node of the
   // correct type, and PHI together all of the LHS's of the instructions.
   PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(),
Index: llvm/test/Transforms/InstCombine/AArch64/load_extend0.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/AArch64/load_extend0.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local i32 @foo(i8* %ip) local_unnamed_addr {
+; CHECK-LABEL: @foo(
+entry:
+  %0 = load i8, i8* %ip, align 1
+  %conv = zext i8 %0 to i64
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+; CHECK-LABEL: do.body:
+; CHECK-NEXT: %1 = phi i64 [ %conv, %entry ], [ %conv1, %do.body ]
+  %1 = phi i64 [ %conv, %entry ], [ %conv1, %do.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %ip, i64 %1
+  %2 = load i8, i8* %arrayidx, align 1
+  %conv1 = zext i8 %2 to i64
+  %cmp = icmp ult i64 %conv1, 100
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  %conv3 = trunc i64 %conv1 to i32
+  ret i32 %conv3
+}
+
+define dso_local i64 @bar(i8* %ip) local_unnamed_addr {
+; CHECK-LABEL: @bar(
+entry:
+  %0 = load i8, i8* %ip, align 1
+  %conv = zext i8 %0 to i64
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+; CHECK-LABEL: do.body:
+; CHECK-NEXT: %1 = phi i64 [ %conv, %entry ], [ %conv1, %do.body ]
+  %1 = phi i64 [ %conv, %entry ], [ %conv1, %do.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %ip, i64 %1
+  %2 = load i8, i8* %arrayidx, align 1
+  call void @foobar(i8 %2)
+  %conv1 = zext i8 %2 to i64
+  %cmp = icmp ult i64 %conv1, 100
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  ret i64 %conv1
+}
+
+declare dso_local void @foobar(i8) local_unnamed_addr
Index: llvm/test/Transforms/InstCombine/ARM/load_ext_combine.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/ARM/load_ext_combine.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
"e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-none-unknown-eabihf" + +; Function Attrs: nounwind +define dso_local void @foo(i32* %a, i32* %b, i32 %n) local_unnamed_addr { +; CHECK-LABEL: @foo( +entry: + %0 = load i32, i32* %a, align 4 + %conv = zext i32 %0 to i64 + %1 = load i32, i32* %b, align 4 + %conv2 = zext i32 %1 to i64 + %sub = sub nsw i32 %n, 1 + br label %do.body + +do.body: ; preds = %do.body, %entry +; CHECK-LABEL: do.body: +; CHECK-NEXT: %i0.0.in = phi i32 [ %0, %entry ], [ %2, %do.body ] + %i0.0 = phi i64 [ %conv, %entry ], [ %conv3, %do.body ] + %n.addr.0 = phi i32 [ %sub, %entry ], [ %sub7, %do.body ] + %a.addr.0 = phi i32* [ %a, %entry ], [ %add.ptr6, %do.body ] + %mul = mul i64 %i0.0, %conv2 + %add.ptr = getelementptr inbounds i32, i32* %a.addr.0, i32 1 + %2 = load i32, i32* %add.ptr, align 4 + %conv3 = zext i32 %2 to i64 + %conv4 = trunc i64 %mul to i32 + store i32 %conv4, i32* %a.addr.0, align 4 + %add.ptr6 = getelementptr inbounds i32, i32* %a.addr.0, i32 8 + %sub7 = sub nsw i32 %n.addr.0, 8 + %cmp = icmp sgt i32 %sub7, 0 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + ret void +} Index: llvm/test/Transforms/InstCombine/ARM/mve_ext_mask_loads.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/InstCombine/ARM/mve_ext_mask_loads.ll @@ -0,0 +1,63 @@ +; RUN: opt < %s -instcombine -S -mattr=+mve | FileCheck %s + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-none-unknown-eabihf" + +define dso_local void @foo(i8* %a, i8* %b, i32 %n) local_unnamed_addr { +; CHECK-LABEL: @foo( +entry: + %0 = bitcast i8* %a to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %b to <8 x i8>* + %4 = load <8 x i8>, <8 x i8>* %3, align 1 + %5 = zext <8 x i8> %4 to <8 x i16> + %add.ptr = getelementptr inbounds i8, i8* %b, i32 8 + br label %do.body + +do.body: ; preds = %do.body, %entry +; CHECK-LABEL: do.body: +; CHECK-NEXT: %v0.0 = phi <8 x i16> [ %2, %entry ], [ %10, %do.body ] + %v0.0 = phi <8 x i16> [ %2, %entry ], [ %14, %do.body ] + %n.addr.0 = phi i32 [ %n, %entry ], [ %sub, %do.body ] + %b.addr.0 = phi i8* [ %add.ptr, %entry ], [ %add.ptr2, %do.body ] + %a.addr.0 = phi i8* [ %a, %entry ], [ %add.ptr1, %do.body ] + %6 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.0) + %7 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %6) + %8 = trunc i32 %7 to i16 + %9 = zext i16 %8 to i32 + %10 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %9) + %11 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %v0.0, <8 x i16> %5, <8 x i1> %10, <8 x i16> undef) + %add.ptr1 = getelementptr inbounds i8, i8* %a.addr.0, i32 8 + %12 = bitcast i8* %add.ptr1 to <8 x i8>* + %13 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %12, i32 1, <8 x i1> %10, <8 x i8> zeroinitializer) + %14 = zext <8 x i8> %13 to <8 x i16> + %15 = trunc <8 x i16> %11 to <8 x i8> + %16 = bitcast i8* %a.addr.0 to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %15, <8 x i8>* %16, i32 1, <8 x i1> %10) + %add.ptr2 = getelementptr inbounds i8, i8* %b.addr.0, i32 8 + %sub = sub nsw i32 %n.addr.0, 8 + %cmp = icmp sgt i32 %sub, 0 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + ret void +} + +; Function Attrs: nofree nosync nounwind readnone +declare <8 x i1> @llvm.arm.mve.vctp16(i32) + +; Function Attrs: nofree nosync nounwind readnone 
+declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>)
+
+; Function Attrs: nofree nosync nounwind readnone
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+
+; Function Attrs: nofree nosync nounwind readnone
+declare <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>)
+
+; Function Attrs: argmemonly mustprogress nofree nosync nounwind readonly willreturn
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
+
+; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn writeonly
+declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32 immarg, <8 x i1>)