Index: llvm/lib/CodeGen/TypePromotion.cpp =================================================================== --- llvm/lib/CodeGen/TypePromotion.cpp +++ llvm/lib/CodeGen/TypePromotion.cpp @@ -143,9 +143,9 @@ unsigned TypeSize = 0; LLVMContext *Ctx = nullptr; unsigned RegisterBitWidth = 0; - SmallPtrSet AllVisited; - SmallPtrSet SafeToPromote; - SmallVector SafeWrap; + SmallPtrSet AllVisited; + SmallPtrSet SafeToPromote; + SmallVector SafeWrap; // Does V have the same size result type as TypeSize. bool EqualTypeSize(Value *V); @@ -174,6 +174,13 @@ // wrapping. bool isLegalToPromote(Value *V); bool TryToPromote(Value *V, unsigned PromotedWidth); + bool TryToPromoteICmp(ICmpInst *ICmpInst, const TargetLowering *TLI, + const DataLayout &DL); + bool TryToPromotePHI(Instruction *Ext, PHINode *PHI, + const TargetLowering *TLI, const DataLayout &DL, + const TargetTransformInfo &TTI, + SmallVectorImpl &ToRemove, + SmallPtrSetImpl &VisitedPHIs); public: static char ID; @@ -908,6 +915,238 @@ return true; } +bool TypePromotion::TryToPromoteICmp(ICmpInst *ICmp, const TargetLowering *TLI, + const DataLayout &DL) { + + LLVM_DEBUG(dbgs() << "IR Promotion: Searching from: " << *ICmp << "\n"); + + for (auto &Op : ICmp->operands()) { + if (auto *I = dyn_cast(Op)) { + EVT SrcVT = TLI->getValueType(DL, I->getType()); + if (SrcVT.isSimple() && TLI->isTypeLegal(SrcVT.getSimpleVT())) + return false; + + if (TLI->getTypeAction(*Ctx, SrcVT) != TargetLowering::TypePromoteInteger) + return false; + EVT PromotedVT = TLI->getTypeToTransformTo(*Ctx, SrcVT); + if (RegisterBitWidth < PromotedVT.getFixedSizeInBits()) { + LLVM_DEBUG(dbgs() << "IR Promotion: Couldn't find target register " + << "for promoted type\n"); + return false; + } + + return TryToPromote(I, PromotedVT.getFixedSizeInBits()); + } + } + return false; +} + +// The goal of this optimization is to move (SZ)Exts next to Loads where they +// are free. 
We look for a PHI-node with a single-use by a (SZ)Ext and look +// for incoming values that are Loads. +bool TypePromotion::TryToPromotePHI(Instruction *ExtI, PHINode *PHI, + const TargetLowering *TLI, + const DataLayout &DL, + const TargetTransformInfo &TTI, + SmallVectorImpl &ToRemove, + SmallPtrSetImpl &VisitedPHIs) { + + EVT ExtVT = TLI->getValueType(DL, ExtI->getType()); + if (!ExtVT.isVector() && RegisterBitWidth < ExtVT.getFixedSizeInBits()) { + LLVM_DEBUG(dbgs() << "IR Promotion: extension type does not fit into " + << "single register\n"); + return false; + } + + bool Transform = true; + SmallVector Worklist; + SmallPtrSet SeenPHIs; + Worklist.push_back(PHI); + // Starting from the current PHI node 'PHI', check whether all incoming + // values are free to Extend. I.e. that they are either: + // - Loads that can be turned into widening loads without extra cost + // - Constants, which we can just extend at compile time + while (!Worklist.empty()) { + PHINode *CurrentPHI = Worklist.pop_back_val(); + if (SeenPHIs.contains(CurrentPHI)) + continue; + VisitedPHIs.insert(CurrentPHI); + for (Value *IncValue : CurrentPHI->incoming_values()) { + if (isa(IncValue)) { + auto *LoadI = cast(IncValue); + bool masked = false; + if (isa(LoadI)) { + Intrinsic::ID Id = cast(LoadI)->getIntrinsicID(); + if (Id != Intrinsic::masked_load) { + Transform = false; + break; + } + masked = true; + } else if (LoadI->getOpcode() != Instruction::Load) { + Transform = false; + break; + } + // Check whether the target can merge the Load and Extend instructions. + InstructionCost Cost = TTI.getCastInstrCost( + ExtI->getOpcode(), ExtI->getType(), LoadI->getType(), + masked ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal, + TTI::TCK_SizeAndLatency, ExtI); + if (!Cost.isValid() || Cost != 0) { + Transform = false; + break; + } + + // So far so good, now we need to make sure the other uses of the Load, + // if any, can also be promoted. 
+ if (!LoadI->hasOneUse()) { + for (Use &U : LoadI->uses()) { + // This is the use of the PHI we are currently looking at. + if (U.getUser() == CurrentPHI) + continue; + if (isa(U.getUser())) { + auto *UseI = cast(U.getUser()); + + // We can just remove this Ext when promoting this PHI. + if (UseI->getOpcode() == ExtI->getOpcode()) + continue; + + // We can change this ICmp after promoting this PHI. + if (isa(U.getUser())) { + auto CmpI = cast(U.getUser()); + if (isa(CmpI->getOperand(0)) || + isa(CmpI->getOperand(1))) + continue; + } + if (auto *UsedPHI = dyn_cast(U.getUser())) { + if (UsedPHI->hasOneUse()) { + // We've seen this UsedPHI and deemed it was OK to promote it + if (SeenPHIs.contains(UsedPHI)) + continue; + + // Otherwise if this PHI is also followed by the same kind of + // Ext with the same type, then add it to the worklist to see + // if we can promote it too. If we can't we have to fail the + // entire transformation. + if (auto UsedExt = dyn_cast( + UsedPHI->getUniqueUndroppableUser())) { + if (UsedExt->getOpcode() == ExtI->getOpcode()) { + EVT UsedVT = TLI->getValueType(DL, UsedExt->getType()); + // Same Ext type so we can check this PHI for promotion + // too. + if (UsedExt->getType() == ExtI->getType()) { + Worklist.push_back(UsedPHI); + continue; + } else if (ExtVT.getFixedSizeInBits() > + UsedVT.getFixedSizeInBits()) { + // The Ext following the other PHI Node is of a smaller + // size, check whether the trunc from the bigger to the + // smaller size is free, if it is, we can still go + // ahead with the transformation. + InstructionCost Cost = TTI.getCastInstrCost( + Instruction::Trunc, UsedExt->getType(), + ExtI->getType(), TTI::CastContextHint::None, + TTI::TCK_SizeAndLatency, NULL); + if (Cost.isValid() && Cost == 0) { + + Worklist.push_back(UsedPHI); + continue; + } + } + } + } + } + } + } + // If it reaches this point then it's a Use we can not promote for + // free. 
+ Transform = false; + break; + } + if (!Transform) + break; + } + } else if (!isa(IncValue)) { + // If it's not an Instruction, nor a Constant, then it is probably + // an Argument value, for which Ext's aren't likely to be free. + Transform = false; + break; + } else if (isa(IncValue) && IncValue->hasOneUse()) + Worklist.push_back(cast(IncValue)); + } + SeenPHIs.insert(CurrentPHI); + } + + if (!Transform) + return false; + + // We have decided to go ahead with the transformation. + IRBuilder<> Builder{*Ctx}; + Builder.SetInsertPoint(PHI); + bool Sign = ExtI->getOpcode() == Instruction::SExt; + Type *ExtType = ExtI->getType(); + llvm::SmallMapVector TransformedPHIs; + // First replace all SeenPHIs with their promoted equivalent and replace the + // uses of the following Ext with the NewPHI. + for (auto OldPHI : SeenPHIs) { + Builder.SetInsertPoint(OldPHI); + PHINode *NewPHI = Builder.CreatePHI(ExtType, OldPHI->getNumOperands()); + TransformedPHIs.insert(std::pair(OldPHI, NewPHI)); + auto *UserI = cast(OldPHI->getUniqueUndroppableUser()); + assert(UserI && (UserI->getOpcode() == ExtI->getOpcode() || + UserI->getOpcode() == Instruction::PHI)); + // If the OldPHI was used by another PHI then we don't need to replace any + // value since that will be taken care of by the following step where we + // create the incoming values for the NewPHI. + if (UserI->getOpcode() != Instruction::PHI) { + // The type of the use of the OldPHI is smaller than the NewPHI, so we + // need to truncate. + if (UserI->getType() != ExtType) { + Builder.SetInsertPoint(OldPHI->getParent(), + OldPHI->getParent()->getFirstInsertionPt()); + UserI->replaceAllUsesWith( + Builder.CreateTrunc(NewPHI, UserI->getType())); + } else + UserI->replaceAllUsesWith(NewPHI); + // Record the Ext for removal during the follow-up clean. 
+ ToRemove.push_back(UserI); + } + Worklist.push_back(OldPHI); + } + + // Populate incoming values of all NewPHIs + while (!Worklist.empty()) { + PHINode *OldPHI = Worklist.pop_back_val(); + PHINode *NewPHI = TransformedPHIs[OldPHI]; + for (BasicBlock *BB : OldPHI->blocks()) { + Value *IncValue = OldPHI->getIncomingValueForBlock(BB); + if (isa(IncValue)) { + Constant *Cst = cast(IncValue); + Constant *NewCst = Sign ? ConstantExpr::getSExt(Cst, ExtType) + : ConstantExpr::getZExt(Cst, ExtType); + NewPHI->addIncoming(NewCst, BB); + } else if (isa(IncValue)) { + NewPHI->addIncoming(TransformedPHIs[cast(IncValue)], BB); + } else { + if (isa(IncValue)) { + // Make sure to insert the Ext after the Instruction. + auto *IncI = cast(IncValue); + Builder.SetInsertPoint(IncI->getParent(), ++IncI->getIterator()); + } else + // This should mean we are dealing with an argument value, so it + // should be OK to insert the Ext at the start of the incoming BB. + Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); + + Value *NewExt = Sign ? Builder.CreateSExt(IncValue, ExtType) + : Builder.CreateZExt(IncValue, ExtType); + NewPHI->addIncoming(NewExt, BB); + } + } + ToRemove.push_back(OldPHI); + } + return true; +} + bool TypePromotion::runOnFunction(Function &F) { if (skipFunction(F) || DisablePromotion) return false; @@ -926,54 +1165,52 @@ const TargetMachine &TM = TPC->getTM(); const TargetSubtargetInfo *SubtargetInfo = TM.getSubtargetImpl(F); const TargetLowering *TLI = SubtargetInfo->getTargetLowering(); - const TargetTransformInfo &TII = - getAnalysis().getTTI(F); + const TargetTransformInfo &TTI = + getAnalysis().getTTI(F); RegisterBitWidth = - TII.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedSize(); + TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedSize(); Ctx = &F.getParent()->getContext(); - // Search up from icmps to try to promote their operands. 
+ SmallVector ToRemove; + SmallPtrSet VisitedPHIs; for (BasicBlock &BB : F) { for (auto &I : BB) { - if (AllVisited.count(&I)) - continue; + if (isa(&I) && (I.getOpcode() == Instruction::ZExt || + I.getOpcode() == Instruction::SExt)) { + PHINode *PHI = dyn_cast(I.getOperand(0)); + if (PHI && !VisitedPHIs.contains(PHI) && PHI->hasOneUse()) + MadeChange |= TryToPromotePHI(cast(&I), + cast(I.getOperand(0)), TLI, DL, + TTI, ToRemove, VisitedPHIs); + } + } + } - if (!isa(&I)) - continue; + // Clean up no-longer used PHI-nodes and Ext instructions. + for (Instruction *I : ToRemove) + I->eraseFromParent(); - auto *ICmp = cast(&I); - // Skip signed or pointer compares - if (ICmp->isSigned() || - !isa(ICmp->getOperand(0)->getType())) + // Search up from icmps to try to promote their operands. + for (BasicBlock &BB : F) { + for (auto &I : BB) { + if (AllVisited.count(&I)) continue; - LLVM_DEBUG(dbgs() << "IR Promotion: Searching from: " << *ICmp << "\n"); + if (isa(&I)) { + auto *ICmp = cast(&I); - for (auto &Op : ICmp->operands()) { - if (auto *I = dyn_cast(Op)) { - EVT SrcVT = TLI->getValueType(DL, I->getType()); - if (SrcVT.isSimple() && TLI->isTypeLegal(SrcVT.getSimpleVT())) - break; + // Skip signed or pointer compares + if (ICmp->isSigned() || + !isa(ICmp->getOperand(0)->getType())) + continue; - if (TLI->getTypeAction(ICmp->getContext(), SrcVT) != - TargetLowering::TypePromoteInteger) - break; - EVT PromotedVT = TLI->getTypeToTransformTo(ICmp->getContext(), SrcVT); - if (RegisterBitWidth < PromotedVT.getFixedSizeInBits()) { - LLVM_DEBUG(dbgs() << "IR Promotion: Couldn't find target register " - << "for promoted type\n"); - break; - } - - MadeChange |= TryToPromote(I, PromotedVT.getFixedSizeInBits()); - break; - } + MadeChange |= TryToPromoteICmp(ICmp, TLI, DL); } + LLVM_DEBUG(if (verifyFunction(F, &dbgs())) { + dbgs() << F; + report_fatal_error("Broken function after type promotion"); + }); } - LLVM_DEBUG(if (verifyFunction(F, &dbgs())) { - dbgs() << F; - 
report_fatal_error("Broken function after type promotion"); - }); } if (MadeChange) LLVM_DEBUG(dbgs() << "After TypePromotion: " << F << "\n"); Index: llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll +++ llvm/test/CodeGen/Thumb2/mve-vmovlloop.ll @@ -155,20 +155,18 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: ldrsh.w r1, [sp, #8] -; CHECK-NEXT: vmov.i16 q0, #0x100 -; CHECK-NEXT: vldrb.u16 q1, [r2], #8 +; CHECK-NEXT: vmov.i16 q1, #0x100 +; CHECK-NEXT: vldrb.u16 q0, [r2], #8 ; CHECK-NEXT: vldrb.u16 q2, [r0], #8 ; CHECK-NEXT: ldr r3, [sp, #12] ; CHECK-NEXT: dlstp.16 lr, r1 ; CHECK-NEXT: .LBB3_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vsub.i16 q3, q0, q1 -; CHECK-NEXT: vmovlb.u8 q2, q2 +; CHECK-NEXT: vsub.i16 q3, q1, q0 ; CHECK-NEXT: vmul.i16 q3, q2, q3 ; CHECK-NEXT: vldrb.u16 q2, [r0], #8 -; CHECK-NEXT: vmla.u16 q3, q1, r3 -; CHECK-NEXT: vldrb.u16 q1, [r2], #8 +; CHECK-NEXT: vmla.u16 q3, q0, r3 +; CHECK-NEXT: vldrb.u16 q0, [r2], #8 ; CHECK-NEXT: vshr.u16 q3, q3, #8 ; CHECK-NEXT: vstrb.16 q3, [r0, #-16] ; CHECK-NEXT: letp lr, .LBB3_1 Index: llvm/test/Transforms/TypePromotion/AArch64/dont-promote-phi-ext.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TypePromotion/AArch64/dont-promote-phi-ext.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=aarch64 -type-promotion -verify -disable-type-promotion=false -S %s -o - | FileCheck %s + +; Function Attrs: nounwind uwtable +define dso_local i32 @f(i8* nocapture readonly %ip) local_unnamed_addr { +; CHECK-LABEL: @f( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[IP:%.*]], align 1 +; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i8 [[TMP0]], 100 +; CHECK-NEXT: br i1 [[CMP7]], label 
[[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TAG_0_IN8:%.*]] = phi i8 [ [[CALL:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TAG_0:%.*]] = zext i8 [[TAG_0_IN8]] to i64 +; CHECK-NEXT: [[CALL]] = tail call i8 @fn(i64 [[TAG_0]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[CALL]], 100 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[TAG_0_IN_LCSSA:%.*]] = phi i8 [ [[TMP0]], [[ENTRY]] ], [ [[CALL]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TAG_0_IN_LCSSA]] to i32 +; CHECK-NEXT: ret i32 [[CONV3]] +; +entry: + %0 = load i8, i8* %ip, align 1 + %cmp7 = icmp ult i8 %0, 100 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %tag.0.in8 = phi i8 [ %call, %for.body ], [ %0, %entry ] + %tag.0 = zext i8 %tag.0.in8 to i64 + %call = tail call i8 @fn(i64 %tag.0) + %cmp = icmp ult i8 %call, 100 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + %tag.0.in.lcssa = phi i8 [ %0, %entry ], [ %call, %for.body ] + %conv3 = zext i8 %tag.0.in.lcssa to i32 + ret i32 %conv3 +} + +declare dso_local i8 @fn(i64) local_unnamed_addr Index: llvm/test/Transforms/TypePromotion/AArch64/promote-phi-ext.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TypePromotion/AArch64/promote-phi-ext.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=aarch64 -type-promotion -verify -disable-type-promotion=false -S %s -o - | FileCheck %s + +define dso_local i32 @foo(i8* %ip) local_unnamed_addr { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[IP:%.*]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i64 +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], 
[[DO_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[IP]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4]] = zext i8 [[TMP3]] to i64 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[TMP3]], 100 +; CHECK-NEXT: br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]] +; CHECK: do.end: +; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: ret i32 [[CONV3]] +; +entry: + %0 = load i8, i8* %ip, align 1 + br label %do.body + +do.body: ; preds = %do.body, %entry + %.in = phi i8 [ %0, %entry ], [ %2, %do.body ] + %1 = zext i8 %.in to i64 + %arrayidx = getelementptr inbounds i8, i8* %ip, i64 %1 + %2 = load i8, i8* %arrayidx, align 1 + %cmp = icmp ult i8 %2, 100 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + %conv3 = zext i8 %2 to i32 + ret i32 %conv3 +} Index: llvm/test/Transforms/TypePromotion/ARM/dont-promote-phi-ext.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TypePromotion/ARM/dont-promote-phi-ext.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=arm -type-promotion -verify -disable-type-promotion=false -S %s -o - | FileCheck %s + +define dso_local i64 @foo(i8* %ip) local_unnamed_addr { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[IP:%.*]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[DOTIN:%.*]] = phi i32 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[DOTIN]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = call i8* @get_addr(i64 [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP5]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 
[[TMP5]], 100 +; CHECK-NEXT: br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]] +; CHECK: do.end: +; CHECK-NEXT: ret i64 [[TMP3]] +; +entry: + %0 = load i8, i8* %ip, align 1 + br label %do.body + +do.body: ; preds = %do.body, %entry + %.in = phi i8 [ %0, %entry ], [ %2, %do.body ] + %1 = zext i8 %.in to i64 + %arrayidx = call i8* @get_addr(i64 %1) + %2 = load i8, i8* %arrayidx, align 1 + %cmp = icmp ult i8 %2, 100 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + ret i64 %1 +} + +declare dso_local i8* @get_addr(i64) local_unnamed_addr + Index: llvm/test/Transforms/TypePromotion/ARM/promote-phi-ext.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TypePromotion/ARM/promote-phi-ext.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=arm -type-promotion -verify -disable-type-promotion=false -S %s -o - | FileCheck %s + +define dso_local i16 @foo(i8* %ip) local_unnamed_addr { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[IP:%.*]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP4:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[IP]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP4]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[TMP3]], 100 +; CHECK-NEXT: br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]] +; CHECK: do.end: +; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP3]] to i16 +; CHECK-NEXT: ret i16 [[CONV3]] +; +entry: + %0 = load i8, i8* %ip, align 1 + br label %do.body + +do.body: ; preds = %do.body, %entry + %.in = phi i8 [ %0, %entry ], [ %2, %do.body ] + %1 = zext i8 %.in to i32 + %arrayidx = getelementptr 
inbounds i8, i8* %ip, i32 %1 + %2 = load i8, i8* %arrayidx, align 1 + %cmp = icmp ult i8 %2, 100 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + %conv3 = zext i8 %2 to i16 + ret i16 %conv3 +} + Index: llvm/test/Transforms/TypePromotion/ARM/promote_mve_phi.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TypePromotion/ARM/promote_mve_phi.ll @@ -0,0 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -type-promotion -S -mattr=+mve | FileCheck %s + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-none-unknown-eabihf" + +define dso_local void @foo(i8* %a, i8* %b, i32 %n) local_unnamed_addr { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[A:%.*]] to <8 x i8>* +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[B:%.*]] to <8 x i8>* +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i16> +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 8 +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x i16> [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP15:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[N_ADDR_0:%.*]] = phi i32 [ [[N:%.*]], [[ENTRY]] ], [ [[SUB:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[B_ADDR_0:%.*]] = phi i8* [ [[ADD_PTR]], [[ENTRY]] ], [ [[ADD_PTR2:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[A_ADDR_0:%.*]] = phi i8* [ [[A]], [[ENTRY]] ], [ [[ADD_PTR1:%.*]], [[DO_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[N_ADDR_0]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 
[[TMP8]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = zext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[TMP6]], <8 x i16> [[TMP5]], <8 x i1> [[TMP11]], <8 x i16> undef) +; CHECK-NEXT: [[ADD_PTR1]] = getelementptr inbounds i8, i8* [[A_ADDR_0]], i32 8 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[ADD_PTR1]] to <8 x i8>* +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP13]], i32 1, <8 x i1> [[TMP11]], <8 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP15]] = zext <8 x i8> [[TMP14]] to <8 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = zext <8 x i8> [[TMP14]] to <8 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = trunc <8 x i16> [[TMP12]] to <8 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[A_ADDR_0]] to <8 x i8>* +; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP17]], <8 x i8>* [[TMP18]], i32 1, <8 x i1> [[TMP11]]) +; CHECK-NEXT: [[ADD_PTR2]] = getelementptr inbounds i8, i8* [[B_ADDR_0]], i32 8 +; CHECK-NEXT: [[SUB]] = sub nsw i32 [[N_ADDR_0]], 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[SUB]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]] +; CHECK: do.end: +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast i8* %a to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %b to <8 x i8>* + %3 = load <8 x i8>, <8 x i8>* %2, align 1 + %4 = zext <8 x i8> %3 to <8 x i16> + %add.ptr = getelementptr inbounds i8, i8* %b, i32 8 + br label %do.body + +do.body: ; preds = %do.body, %entry + %v0.0 = phi <8 x i8> [ %1, %entry ], [ %12, %do.body ] + %n.addr.0 = phi i32 [ %n, %entry ], [ %sub, %do.body ] + %b.addr.0 = phi i8* [ %add.ptr, %entry ], [ %add.ptr2, %do.body ] + %a.addr.0 = phi i8* [ %a, %entry ], [ %add.ptr1, %do.body ] + %conv = zext <8 x i8> %v0.0 to <8 x i16> + %5 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n.addr.0) + %6 = call i32 
@llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %5) + %7 = trunc i32 %6 to i16 + %8 = zext i16 %7 to i32 + %9 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %8) + %10 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %conv, <8 x i16> %4, <8 x i1> %9, <8 x i16> undef) + %add.ptr1 = getelementptr inbounds i8, i8* %a.addr.0, i32 8 + %11 = bitcast i8* %add.ptr1 to <8 x i8>* + %12 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %11, i32 1, <8 x i1> %9, <8 x i8> zeroinitializer) + %13 = zext <8 x i8> %12 to <8 x i16> + %14 = trunc <8 x i16> %10 to <8 x i8> + %15 = bitcast i8* %a.addr.0 to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %14, <8 x i8>* %15, i32 1, <8 x i1> %9) + %add.ptr2 = getelementptr inbounds i8, i8* %b.addr.0, i32 8 + %sub = sub nsw i32 %n.addr.0, 8 + %cmp = icmp sgt i32 %sub, 0 + br i1 %cmp, label %do.body, label %do.end + +do.end: ; preds = %do.body + ret void +} + +; Function Attrs: nofree nosync nounwind readnone +declare <8 x i1> @llvm.arm.mve.vctp16(i32) + +; Function Attrs: nofree nosync nounwind readnone +declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>) + +; Function Attrs: nofree nosync nounwind readnone +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) + +; Function Attrs: nofree nosync nounwind readnone +declare <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) + +; Function Attrs: argmemonly mustprogress nofree nosync nounwind readonly willreturn +declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) + +; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn writeonly +declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32 immarg, <8 x i1>)