diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -202,9 +202,10 @@
       Function *F, std::function<const LoopAccessInfo &(Loop &)> *GetLAA,
       LoopInfo *LI, OptimizationRemarkEmitter *ORE,
       LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
-      AssumptionCache *AC)
+      AssumptionCache *AC, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
       : TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT),
-        GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
+        GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC),
+        BFI(BFI), PSI(PSI) {}

   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -478,6 +479,10 @@
   /// Assume instructions in predicated blocks must be dropped if the CFG gets
   /// flattened.
   SmallPtrSet<Instruction *, 8> ConditionalAssumes;
+
+  /// BFI and PSI are used to check for profile guided size optimizations.
+  BlockFrequencyInfo *BFI;
+  ProfileSummaryInfo *PSI;
 };

 } // namespace llvm
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3310,7 +3310,7 @@
     // Check to see if this FP immediate is already legal.
     // If this is a legal constant, turn it into a TargetConstantFP node.
     if (!TLI.isFPImmLegal(CFP->getValueAPF(), Node->getValueType(0),
-                          DAG.getMachineFunction().getFunction().hasOptSize()))
+                          DAG.shouldOptForSize()))
       Results.push_back(ExpandConstantFP(CFP, true));
     break;
   }
diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp
--- a/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -16,8 +16,11 @@
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/Debug.h"
@@ -111,6 +114,12 @@
         MachineFunctionProperties::Property::NoVRegs);
   }

+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
 private:
   TargetSchedModel TSM;
   const X86InstrInfo *TII = nullptr;
@@ -205,21 +214,27 @@
   TSM.init(&ST);
   TII = ST.getInstrInfo();
   TRI = ST.getRegisterInfo();
+  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+  auto *MBFI = (PSI && PSI->hasProfileSummary())
+                   ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
+                   : nullptr;
   LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
   for (MachineBasicBlock &MBB : MF) {
     // First pass. Try to remove or optimize existing LEAs.
+    bool OptIncDecPerBB =
+        OptIncDec || llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
     for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
       if (!isLEA(I->getOpcode()))
         continue;

-      if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP))
+      if (optTwoAddrLEA(I, MBB, OptIncDecPerBB, UseLEAForSP))
         continue;

       if (IsSlowLEA)
         processInstructionForSlowLEA(I, MBB);
       else if (IsSlow3OpsLEA)
-        processInstrForSlow3OpLEA(I, MBB, OptIncDec);
+        processInstrForSlow3OpLEA(I, MBB, OptIncDecPerBB);
     }

     // Second pass for creating LEAs. This may reverse some of the
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4526,7 +4526,7 @@
     // the patterns on the add/sub/and/or/xor with immediate patterns in the
     // tablegen files to check immediate use count without making the patterns
     // unavailable to the fast-isel table.
-    if (!OptForSize)
+    if (!CurDAG->shouldOptForSize())
       break;

     // Only handle i8/i16/i32/i64.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34378,7 +34378,7 @@
     return DAG.getBitcast(RootVT, V1);
   }

-  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  bool OptForSize = DAG.shouldOptForSize();
   unsigned RootSizeInBits = RootVT.getSizeInBits();
   unsigned NumRootElts = RootVT.getVectorNumElements();
   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
@@ -39218,7 +39218,7 @@
   }

   // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
-  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  bool OptForSize = DAG.shouldOptForSize();
   if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
     return SDValue();
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 using namespace llvm;

@@ -412,7 +413,11 @@
   const ValueToValueMap &Strides = getSymbolicStrides() ?
      *getSymbolicStrides() : ValueToValueMap();

-  bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize();
+  Function *F = TheLoop->getHeader()->getParent();
+  bool OptForSize = F->hasOptSize() ||
+                    llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
+                                                PGSOQueryType::IRPass);
+  bool CanAddPredicate = !OptForSize;
   int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
   if (Stride == 1 || Stride == -1)
     return Stride;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -395,11 +395,13 @@
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
-                     LoopVectorizationCostModel *CM)
+                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+                     ProfileSummaryInfo *PSI)
       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
         Builder(PSE.getSE()->getContext()),
-        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
+        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
+        BFI(BFI), PSI(PSI) {}
   virtual ~InnerLoopVectorizer() = default;

   /// Create a new empty loop. Unlink the old loop and connect the new one.
@@ -779,6 +781,10 @@
   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
   // fixed up at the end of vector code generation.
   SmallVector<PHINode *, 8> OrigPHIsToFix;
+
+  /// BFI and PSI are used to check for profile guided size optimizations.
+  BlockFrequencyInfo *BFI;
+  ProfileSummaryInfo *PSI;
 };

 class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -789,9 +795,10 @@
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
-                   LoopVectorizationCostModel *CM)
+                   LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+                   ProfileSummaryInfo *PSI)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
-                            UnrollFactor, LVL, CM) {}
+                            UnrollFactor, LVL, CM, BFI, PSI) {}

 private:
   Value *getBroadcastInstrs(Value *V) override;
@@ -2754,7 +2761,9 @@
   if (C->isZero())
     return;

-  assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
+  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
+           llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+                                       PGSOQueryType::IRPass)) &&
          "Cannot SCEV check stride or overflow when optimizing for size");

   SCEVCheckBlock->setName("vector.scevcheck");
@@ -2800,7 +2809,9 @@
   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
                             "claimed checks are required");

-  if (MemCheckBlock->getParent()->hasOptSize()) {
+  if (MemCheckBlock->getParent()->hasOptSize() ||
+      llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+                                  PGSOQueryType::IRPass)) {
     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
            "Cannot emit memory checks when optimizing for size, unless forced "
            "to vectorize.");
@@ -7716,7 +7727,7 @@
   LVP.setBestPlan(VF.Width, 1);

   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
-                         &CM);
+                         &CM, BFI, PSI);
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                     << L->getHeader()->getParent()->getName() << "\"\n");
   LVP.executePlan(LB, DT);
@@ -7780,7 +7791,7 @@
   // Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE); LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, - &Requirements, &Hints, DB, AC); + &Requirements, &Hints, DB, AC, BFI, PSI); if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); @@ -7980,8 +7991,8 @@ assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, - &CM); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, + BFI, PSI); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -7993,7 +8004,7 @@ } else { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM); + &LVL, &CM, BFI, PSI); LVP.executePlan(LB, DT); ++LoopsVectorized; diff --git a/llvm/test/CodeGen/AArch64/arm64-fp-imm-size.ll b/llvm/test/CodeGen/AArch64/arm64-fp-imm-size.ll --- a/llvm/test/CodeGen/AArch64/arm64-fp-imm-size.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fp-imm-size.ll @@ -38,3 +38,38 @@ ; CHECK-NEXT: ret ret fp128 0xL00000000000000000000000000000000 } + +; CHECK: literal8 +; CHECK: .quad 0x0000001fffffffd +define double @foo2_pgso() !prof !14 { +; CHECK: _foo2_pgso: +; CHECK: adrp x[[REG:[0-9]+]], lCPI4_0@PAGE +; CHECK: ldr d0, [x[[REG]], lCPI4_0@PAGEOFF] +; CHECK-NEXT: ret + ret double 0x1FFFFFFFd1 +} + +define float @bar_pgso() !prof !14 { +; CHECK: _bar_pgso: +; CHECK: adrp x[[REG:[0-9]+]], lCPI5_0@PAGE +; CHECK: ldr s0, [x[[REG]], lCPI5_0@PAGEOFF] +; CHECK-NEXT: ret + ret float 0x400921FB80000000 +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/fixup-lea.ll b/llvm/test/CodeGen/X86/fixup-lea.ll --- a/llvm/test/CodeGen/X86/fixup-lea.ll +++ b/llvm/test/CodeGen/X86/fixup-lea.ll @@ -109,31 +109,18 @@ } define void @foo_pgso(i32 inreg %dns) !prof !14 { -; SLOW-LABEL: foo_pgso: -; SLOW: # %bb.0: # %entry -; SLOW-NEXT: xorl %ecx, %ecx -; SLOW-NEXT: decl %ecx -; SLOW-NEXT: .LBB4_1: # %for.body -; SLOW-NEXT: # =>This Inner Loop Header: Depth=1 -; SLOW-NEXT: movzwl %cx, %edx -; SLOW-NEXT: decl %ecx -; SLOW-NEXT: cmpl %eax, %edx -; SLOW-NEXT: jl .LBB4_1 -; SLOW-NEXT: # %bb.2: # %for.end -; SLOW-NEXT: retl -; -; FAST-LABEL: foo_pgso: -; FAST: # %bb.0: # %entry -; FAST-NEXT: xorl %ecx, %ecx -; FAST-NEXT: decl %ecx -; FAST-NEXT: .LBB4_1: # %for.body -; FAST-NEXT: # =>This Inner Loop Header: Depth=1 -; FAST-NEXT: movzwl %cx, %edx -; FAST-NEXT: addl $-1, %ecx -; FAST-NEXT: cmpl %eax, %edx -; FAST-NEXT: jl .LBB4_1 -; FAST-NEXT: # %bb.2: # %for.end -; FAST-NEXT: retl +; CHECK-LABEL: foo_pgso: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: decl %ecx +; CHECK-NEXT: .LBB4_1: # %for.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movzwl %cx, %edx +; CHECK-NEXT: decl %ecx +; CHECK-NEXT: cmpl %eax, %edx +; 
CHECK-NEXT: jl .LBB4_1
+; CHECK-NEXT: # %bb.2: # %for.end
+; CHECK-NEXT: retl
 entry:
   br label %for.body
@@ -149,31 +136,18 @@
 }

 define void @bar_pgso(i32 inreg %dns) !prof !14 {
-; SLOW-LABEL: bar_pgso:
-; SLOW: # %bb.0: # %entry
-; SLOW-NEXT: xorl %ecx, %ecx
-; SLOW-NEXT: incl %ecx
-; SLOW-NEXT: .LBB5_1: # %for.body
-; SLOW-NEXT: # =>This Inner Loop Header: Depth=1
-; SLOW-NEXT: movzwl %cx, %edx
-; SLOW-NEXT: incl %ecx
-; SLOW-NEXT: cmpl %eax, %edx
-; SLOW-NEXT: jl .LBB5_1
-; SLOW-NEXT: # %bb.2: # %for.end
-; SLOW-NEXT: retl
-;
-; FAST-LABEL: bar_pgso:
-; FAST: # %bb.0: # %entry
-; FAST-NEXT: xorl %ecx, %ecx
-; FAST-NEXT: incl %ecx
-; FAST-NEXT: .LBB5_1: # %for.body
-; FAST-NEXT: # =>This Inner Loop Header: Depth=1
-; FAST-NEXT: movzwl %cx, %edx
-; FAST-NEXT: addl $1, %ecx
-; FAST-NEXT: cmpl %eax, %edx
-; FAST-NEXT: jl .LBB5_1
-; FAST-NEXT: # %bb.2: # %for.end
-; FAST-NEXT: retl
+; CHECK-LABEL: bar_pgso:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: .LBB5_1: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movzwl %cx, %edx
+; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: cmpl %eax, %edx
+; CHECK-NEXT: jl .LBB5_1
+; CHECK-NEXT: # %bb.2: # %for.end
+; CHECK-NEXT: retl
 entry:
   br label %for.body
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -184,6 +184,7 @@
 ; CHECK-NEXT: X86 Byte/Word Instruction Fixup
 ; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: X86 Atom pad short functions
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT: X86 LEA Fixup
 ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible
 ; CHECK-NEXT: X86 Discriminate Memory Operands
diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll
--- a/llvm/test/CodeGen/X86/phaddsub-extract.ll
+++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll
@@ -2094,6 +2094,28 @@
   ret i32 %x230
 }

+define i32 @hadd32_4_pgso(<4 x i32> %x225) !prof !14 {
+; SSE3-LABEL: hadd32_4_pgso:
+; SSE3: # %bb.0:
+; SSE3-NEXT: phaddd %xmm0, %xmm0
+; SSE3-NEXT: phaddd %xmm0, %xmm0
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: retq
+;
+; AVX-LABEL: hadd32_4_pgso:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
+  %x226 = shufflevector <4 x i32> %x225, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %x227 = add <4 x i32> %x225, %x226
+  %x228 = shufflevector <4 x i32> %x227, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %x229 = add <4 x i32> %x227, %x228
+  %x230 = extractelement <4 x i32> %x229, i32 0
+  ret i32 %x230
+}
+
 define i32 @hadd32_8_optsize(<8 x i32> %x225) optsize {
 ; SSE3-LABEL: hadd32_8_optsize:
 ; SSE3: # %bb.0:
@@ -2141,3 +2163,20 @@
   %x230 = extractelement <16 x i32> %x229, i32 0
   ret i32 %x230
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/popcnt.ll
b/llvm/test/CodeGen/X86/popcnt.ll --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -1034,8 +1034,454 @@ ret i128 %cnt } +define i32 @cnt32_pgso(i32 %x) nounwind readnone !prof !14 { +; X32-LABEL: cnt32_pgso: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl %ecx +; X32-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X32-NEXT: subl %ecx, %eax +; X32-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X32-NEXT: movl %eax, %edx +; X32-NEXT: andl %ecx, %edx +; X32-NEXT: shrl $2, %eax +; X32-NEXT: andl %ecx, %eax +; X32-NEXT: addl %edx, %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $4, %ecx +; X32-NEXT: addl %eax, %ecx +; X32-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X32-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 +; X32-NEXT: shrl $24, %eax +; X32-NEXT: retl +; +; X64-LABEL: cnt32_pgso: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: subl %eax, %edi +; X64-NEXT: movl $858993459, %eax # imm = 0x33333333 +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: andl %eax, %ecx +; X64-NEXT: shrl $2, %edi +; X64-NEXT: andl %eax, %edi +; X64-NEXT: addl %ecx, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl $4, %eax +; X64-NEXT: addl %edi, %eax +; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X64-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; X64-NEXT: shrl $24, %eax +; X64-NEXT: retq +; +; X32-POPCNT-LABEL: cnt32_pgso: +; X32-POPCNT: # %bb.0: +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X32-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: cnt32_pgso: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntl %edi, %eax +; X64-POPCNT-NEXT: retq + %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) + ret i32 %cnt +} + +define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 { +; X32-NOSSE-LABEL: cnt64_pgso: +; X32-NOSSE: # %bb.0: +; X32-NOSSE-NEXT: pushl %ebx +; X32-NOSSE-NEXT: pushl %edi +; X32-NOSSE-NEXT: pushl %esi +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOSSE-NEXT: movl %ecx, %edx +; X32-NOSSE-NEXT: shrl %edx +; X32-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 +; X32-NOSSE-NEXT: andl %esi, %edx +; X32-NOSSE-NEXT: subl %edx, %ecx +; X32-NOSSE-NEXT: movl $858993459, %edx # imm = 0x33333333 +; X32-NOSSE-NEXT: movl %ecx, %edi +; X32-NOSSE-NEXT: andl %edx, %edi +; X32-NOSSE-NEXT: shrl $2, %ecx +; X32-NOSSE-NEXT: andl %edx, %ecx +; X32-NOSSE-NEXT: addl %edi, %ecx +; X32-NOSSE-NEXT: movl %ecx, %edi +; X32-NOSSE-NEXT: shrl $4, %edi +; X32-NOSSE-NEXT: addl %ecx, %edi +; X32-NOSSE-NEXT: movl $252645135, %ecx # imm = 0xF0F0F0F +; X32-NOSSE-NEXT: andl %ecx, %edi +; X32-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %edi +; X32-NOSSE-NEXT: movl %eax, %ebx +; X32-NOSSE-NEXT: shrl %ebx +; X32-NOSSE-NEXT: andl %esi, %ebx +; X32-NOSSE-NEXT: subl %ebx, %eax +; X32-NOSSE-NEXT: movl %eax, %esi +; X32-NOSSE-NEXT: andl %edx, %esi +; X32-NOSSE-NEXT: shrl $2, %eax +; X32-NOSSE-NEXT: andl %edx, %eax +; X32-NOSSE-NEXT: addl %esi, %eax +; X32-NOSSE-NEXT: movl %eax, %edx +; X32-NOSSE-NEXT: shrl $4, %edx +; X32-NOSSE-NEXT: addl %eax, %edx +; X32-NOSSE-NEXT: andl %ecx, %edx +; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %eax +; X32-NOSSE-NEXT: addl %edi, %eax +; X32-NOSSE-NEXT: xorl %edx, %edx +; X32-NOSSE-NEXT: popl %esi +; X32-NOSSE-NEXT: popl %edi +; X32-NOSSE-NEXT: popl %ebx +; 
X32-NOSSE-NEXT: retl +; +; X64-LABEL: cnt64_pgso: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: subq %rcx, %rdi +; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NEXT: movq %rdi, %rcx +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: shrq $2, %rdi +; X64-NEXT: andq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq $4, %rax +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: shrq $56, %rax +; X64-NEXT: retq +; +; X32-POPCNT-LABEL: cnt64_pgso: +; X32-POPCNT: # %bb.0: +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax +; X32-POPCNT-NEXT: addl %ecx, %eax +; X32-POPCNT-NEXT: xorl %edx, %edx +; X32-POPCNT-NEXT: retl +; +; X64-POPCNT-LABEL: cnt64_pgso: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntq %rdi, %rax +; X64-POPCNT-NEXT: retq +; +; X32-SSE2-LABEL: cnt64_pgso: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrlw $1, %xmm1 +; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE2-NEXT: psubb %xmm1, %xmm0 +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE2-NEXT: pand %xmm1, %xmm2 +; X32-SSE2-NEXT: psrlw $2, %xmm0 +; X32-SSE2-NEXT: pand %xmm1, %xmm0 +; X32-SSE2-NEXT: paddb %xmm2, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrlw $4, %xmm1 +; X32-SSE2-NEXT: paddb %xmm0, %xmm1 +; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE2-NEXT: pxor %xmm0, %xmm0 +; X32-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X32-SSE2-NEXT: movd %xmm0, %eax +; X32-SSE2-NEXT: xorl %edx, %edx +; X32-SSE2-NEXT: retl +; +; X32-SSSE3-LABEL: cnt64_pgso: +; X32-SSSE3: # %bb.0: +; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; X32-SSSE3-NEXT: pand %xmm0, %xmm2 +; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4 +; X32-SSSE3-NEXT: psrlw $4, %xmm1 +; X32-SSSE3-NEXT: pand %xmm0, %xmm1 +; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3 +; X32-SSSE3-NEXT: paddb %xmm4, %xmm3 +; X32-SSSE3-NEXT: pxor %xmm0, %xmm0 +; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0 +; X32-SSSE3-NEXT: movd %xmm0, %eax +; X32-SSSE3-NEXT: xorl %edx, %edx +; X32-SSSE3-NEXT: retl + %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) + ret i64 %cnt +} + +define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { +; X32-NOSSE-LABEL: cnt128_pgso: +; X32-NOSSE: # %bb.0: +; X32-NOSSE-NEXT: pushl %ebp +; X32-NOSSE-NEXT: pushl %ebx +; X32-NOSSE-NEXT: pushl %edi +; X32-NOSSE-NEXT: pushl %esi +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NOSSE-NEXT: movl %ebx, %ecx +; X32-NOSSE-NEXT: shrl %ecx +; X32-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 +; X32-NOSSE-NEXT: andl %edi, %ecx +; X32-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 +; X32-NOSSE-NEXT: subl %ecx, %ebx +; 
X32-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X32-NOSSE-NEXT: movl %ebx, %ebp +; X32-NOSSE-NEXT: andl %ecx, %ebp +; X32-NOSSE-NEXT: shrl $2, %ebx +; X32-NOSSE-NEXT: andl %ecx, %ebx +; X32-NOSSE-NEXT: addl %ebp, %ebx +; X32-NOSSE-NEXT: movl %ebx, %ebp +; X32-NOSSE-NEXT: shrl $4, %ebp +; X32-NOSSE-NEXT: addl %ebx, %ebp +; X32-NOSSE-NEXT: movl %eax, %ebx +; X32-NOSSE-NEXT: shrl %ebx +; X32-NOSSE-NEXT: andl %edi, %ebx +; X32-NOSSE-NEXT: subl %ebx, %eax +; X32-NOSSE-NEXT: movl %eax, %ebx +; X32-NOSSE-NEXT: andl %ecx, %ebx +; X32-NOSSE-NEXT: shrl $2, %eax +; X32-NOSSE-NEXT: andl %ecx, %eax +; X32-NOSSE-NEXT: addl %ebx, %eax +; X32-NOSSE-NEXT: movl %eax, %edi +; X32-NOSSE-NEXT: shrl $4, %edi +; X32-NOSSE-NEXT: addl %eax, %edi +; X32-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F +; X32-NOSSE-NEXT: andl %ebx, %ebp +; X32-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %eax +; X32-NOSSE-NEXT: andl %ebx, %edi +; X32-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %edi +; X32-NOSSE-NEXT: addl %eax, %edi +; X32-NOSSE-NEXT: movl %esi, %eax +; X32-NOSSE-NEXT: shrl %eax +; X32-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 +; X32-NOSSE-NEXT: andl %ebp, %eax +; X32-NOSSE-NEXT: subl %eax, %esi +; X32-NOSSE-NEXT: movl %esi, %eax +; X32-NOSSE-NEXT: andl %ecx, %eax +; X32-NOSSE-NEXT: shrl $2, %esi +; X32-NOSSE-NEXT: andl %ecx, %esi +; X32-NOSSE-NEXT: addl %eax, %esi +; X32-NOSSE-NEXT: movl %esi, %eax +; X32-NOSSE-NEXT: shrl $4, %eax +; X32-NOSSE-NEXT: addl %esi, %eax +; X32-NOSSE-NEXT: movl %edx, %esi +; X32-NOSSE-NEXT: shrl %esi +; X32-NOSSE-NEXT: andl %ebp, %esi +; X32-NOSSE-NEXT: subl %esi, %edx +; X32-NOSSE-NEXT: movl %edx, %esi +; X32-NOSSE-NEXT: andl %ecx, %esi +; X32-NOSSE-NEXT: shrl $2, %edx +; X32-NOSSE-NEXT: andl %ecx, %edx +; X32-NOSSE-NEXT: addl %esi, %edx +; X32-NOSSE-NEXT: movl %edx, %ecx +; X32-NOSSE-NEXT: shrl $4, %ecx +; X32-NOSSE-NEXT: addl %edx, %ecx +; X32-NOSSE-NEXT: andl %ebx, %eax +; X32-NOSSE-NEXT: andl %ebx, %ecx +; X32-NOSSE-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %eax +; X32-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %ecx +; X32-NOSSE-NEXT: addl %eax, %ecx +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOSSE-NEXT: addl %edi, %ecx +; X32-NOSSE-NEXT: xorl %edx, %edx +; X32-NOSSE-NEXT: movl %edx, 12(%eax) +; X32-NOSSE-NEXT: movl %edx, 8(%eax) +; X32-NOSSE-NEXT: movl %edx, 4(%eax) +; X32-NOSSE-NEXT: movl %ecx, (%eax) +; X32-NOSSE-NEXT: popl %esi +; X32-NOSSE-NEXT: popl %edi +; X32-NOSSE-NEXT: popl %ebx +; X32-NOSSE-NEXT: popl %ebp +; X32-NOSSE-NEXT: retl $4 +; +; X64-LABEL: cnt128_pgso: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 +; X64-NEXT: andq %r8, %rax +; X64-NEXT: subq %rax, %rsi +; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: andq %rax, %rsi +; X64-NEXT: addq %rcx, %rsi +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: shrq $4, %rcx +; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %r9, %rcx +; X64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101 +; X64-NEXT: imulq %rdx, %rcx +; X64-NEXT: shrq $56, %rcx +; X64-NEXT: movq %rdi, %rsi +; X64-NEXT: shrq %rsi +; X64-NEXT: andq %r8, %rsi +; 
X64-NEXT: subq %rsi, %rdi +; X64-NEXT: movq %rdi, %rsi +; X64-NEXT: andq %rax, %rsi +; X64-NEXT: shrq $2, %rdi +; X64-NEXT: andq %rax, %rdi +; X64-NEXT: addq %rsi, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrq $4, %rax +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: andq %r9, %rax +; X64-NEXT: imulq %rdx, %rax +; X64-NEXT: shrq $56, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: retq +; +; X32-POPCNT-LABEL: cnt128_pgso: +; X32-POPCNT: # %bb.0: +; X32-POPCNT-NEXT: pushl %esi +; X32-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X32-POPCNT-NEXT: addl %ecx, %edx +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx +; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X32-POPCNT-NEXT: addl %ecx, %esi +; X32-POPCNT-NEXT: addl %edx, %esi +; X32-POPCNT-NEXT: xorl %ecx, %ecx +; X32-POPCNT-NEXT: movl %ecx, 12(%eax) +; X32-POPCNT-NEXT: movl %ecx, 8(%eax) +; X32-POPCNT-NEXT: movl %ecx, 4(%eax) +; X32-POPCNT-NEXT: movl %esi, (%eax) +; X32-POPCNT-NEXT: popl %esi +; X32-POPCNT-NEXT: retl $4 +; +; X64-POPCNT-LABEL: cnt128_pgso: +; X64-POPCNT: # %bb.0: +; X64-POPCNT-NEXT: popcntq %rsi, %rcx +; X64-POPCNT-NEXT: popcntq %rdi, %rax +; X64-POPCNT-NEXT: addq %rcx, %rax +; X64-POPCNT-NEXT: xorl %edx, %edx +; X64-POPCNT-NEXT: retq +; +; X32-SSE2-LABEL: cnt128_pgso: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrlw $1, %xmm1 +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] +; X32-SSE2-NEXT: pand %xmm2, %xmm1 +; X32-SSE2-NEXT: psubb %xmm1, %xmm0 +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE2-NEXT: pand %xmm1, %xmm3 +; X32-SSE2-NEXT: psrlw $2, %xmm0 +; X32-SSE2-NEXT: pand %xmm1, %xmm0 +; X32-SSE2-NEXT: paddb %xmm3, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE2-NEXT: psrlw $4, %xmm3 +; X32-SSE2-NEXT: paddb %xmm0, %xmm3 +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE2-NEXT: pand %xmm0, %xmm3 +; X32-SSE2-NEXT: pxor %xmm4, %xmm4 +; X32-SSE2-NEXT: psadbw %xmm4, %xmm3 +; X32-SSE2-NEXT: movd %xmm3, %ecx +; X32-SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; X32-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X32-SSE2-NEXT: psrlw $1, %xmm5 +; X32-SSE2-NEXT: pand %xmm2, %xmm5 +; X32-SSE2-NEXT: psubb %xmm5, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 +; X32-SSE2-NEXT: pand %xmm1, %xmm2 +; X32-SSE2-NEXT: psrlw $2, %xmm3 +; X32-SSE2-NEXT: pand %xmm1, %xmm3 +; X32-SSE2-NEXT: paddb %xmm2, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X32-SSE2-NEXT: psrlw $4, %xmm1 +; X32-SSE2-NEXT: paddb %xmm3, %xmm1 +; X32-SSE2-NEXT: pand %xmm0, %xmm1 +; X32-SSE2-NEXT: psadbw %xmm4, %xmm1 +; X32-SSE2-NEXT: movd %xmm1, %edx +; X32-SSE2-NEXT: addl %ecx, %edx +; X32-SSE2-NEXT: xorl %ecx, %ecx +; X32-SSE2-NEXT: movl %ecx, 12(%eax) +; X32-SSE2-NEXT: movl %ecx, 8(%eax) +; X32-SSE2-NEXT: movl %ecx, 4(%eax) +; X32-SSE2-NEXT: movl %edx, (%eax) +; X32-SSE2-NEXT: retl $4 +; +; X32-SSSE3-LABEL: cnt128_pgso: +; X32-SSSE3: # %bb.0: +; X32-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; X32-SSSE3-NEXT: pand %xmm0, %xmm2 +; X32-SSSE3-NEXT: movdqa 
{{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4 +; X32-SSSE3-NEXT: psrlw $4, %xmm1 +; X32-SSSE3-NEXT: pand %xmm0, %xmm1 +; X32-SSSE3-NEXT: movdqa %xmm3, %xmm2 +; X32-SSSE3-NEXT: pshufb %xmm1, %xmm2 +; X32-SSSE3-NEXT: paddb %xmm4, %xmm2 +; X32-SSSE3-NEXT: pxor %xmm1, %xmm1 +; X32-SSSE3-NEXT: psadbw %xmm1, %xmm2 +; X32-SSSE3-NEXT: movd %xmm2, %ecx +; X32-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X32-SSSE3-NEXT: movdqa %xmm2, %xmm4 +; X32-SSSE3-NEXT: pand %xmm0, %xmm4 +; X32-SSSE3-NEXT: movdqa %xmm3, %xmm5 +; X32-SSSE3-NEXT: pshufb %xmm4, %xmm5 +; X32-SSSE3-NEXT: psrlw $4, %xmm2 +; X32-SSSE3-NEXT: pand %xmm0, %xmm2 +; X32-SSSE3-NEXT: pshufb %xmm2, %xmm3 +; X32-SSSE3-NEXT: paddb %xmm5, %xmm3 +; X32-SSSE3-NEXT: psadbw %xmm1, %xmm3 +; X32-SSSE3-NEXT: movd %xmm3, %edx +; X32-SSSE3-NEXT: addl %ecx, %edx +; X32-SSSE3-NEXT: xorl %ecx, %ecx +; X32-SSSE3-NEXT: movl %ecx, 12(%eax) +; X32-SSSE3-NEXT: movl %ecx, 8(%eax) +; X32-SSSE3-NEXT: movl %ecx, 4(%eax) +; X32-SSSE3-NEXT: movl %edx, (%eax) +; X32-SSSE3-NEXT: retl $4 + %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) + ret i128 %cnt +} + declare i8 @llvm.ctpop.i8(i8) nounwind readnone declare i16 @llvm.ctpop.i16(i16) nounwind readnone declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i64 @llvm.ctpop.i64(i64) nounwind readnone declare i128 @llvm.ctpop.i128(i128) nounwind readnone + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/pr27202.ll b/llvm/test/CodeGen/X86/pr27202.ll --- a/llvm/test/CodeGen/X86/pr27202.ll +++ b/llvm/test/CodeGen/X86/pr27202.ll @@ -14,6 +14,19 @@ ret i1 %cmp } +define i1 @foo_pgso(i32 %i) !prof !14 { +; CHECK-LABEL: foo_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: movl $305419896, %eax # imm = 0x12345678 +; CHECK-NEXT: andl %eax, %edi +; CHECK-NEXT: cmpl %eax, %edi +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq + %and = and i32 %i, 305419896 + %cmp = icmp eq i32 %and, 305419896 + ret i1 %cmp +} + ; 8-bit ALU immediates probably have small encodings. ; We do not want to hoist the constant into a register here. 
@@ -52,3 +65,20 @@ %or4 = or i64 %or, %shl ret i64 %or4 } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll --- a/llvm/test/CodeGen/X86/splat-for-size.ll +++ b/llvm/test/CodeGen/X86/splat-for-size.ll @@ -417,6 +417,33 @@ ret <8 x i64> %shuffle } +define <8 x i64> @pr23259_pgso() !prof !14 { +; AVX-LABEL: pr23259_pgso: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] +; AVX-NEXT: retq +; +; AVX2-LABEL: pr23259_pgso: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa {{.*}}(%rip), %ymm0 +; AVX2-NEXT: movl $1, %eax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,1] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] +; AVX2-NEXT: retq +entry: + %0 = load <4 x i64>, <4 x i64>* bitcast (<3 x i64>* @A to <4 x i64>*), align 32 + %1 = shufflevector <4 x i64> %0, <4 x i64> undef, <3 x i32> + %shuffle = shufflevector <3 x i64> , <3 x i64> %1, <8 x i32> + ret <8 x i64> %shuffle +} + attributes #0 = { optsize } attributes #1 = { minsize } diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -121,6 +121,38 @@ br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 } +define void @pr43371_pgso() !prof !14 { +; +; CHECK-LABEL: @pr43371_pgso +; CHECK-NOT: vector.scevcheck +; +; We do not want to generate SCEV predicates when optimising for size, because +; that will lead to extra code generation such as the SCEV overflow runtime +; checks. Not generating SCEV predicates can still result in vectorisation as +; the non-consecutive loads/stores can be scalarized: +; +; CHECK: vector.body: +; CHECK: store i16 0, i16* %{{.*}}, align 1 +; CHECK: store i16 0, i16* %{{.*}}, align 1 +; CHECK: br i1 {{.*}}, label %vector.body +; +entry: + br label %for.body29 + +for.cond.cleanup28: + unreachable + +for.body29: + %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] + %add33 = add i16 undef, %i24.0170 + %idxprom34 = zext i16 %add33 to i32 + %arrayidx35 = getelementptr [2592 x i16], [2592 x i16] * @cm_array, i32 0, i32 %idxprom34 + store i16 0, i16 * %arrayidx35, align 1 + %inc37 = add i16 %i24.0170, 1 + %cmp26 = icmp ult i16 %inc37, 756 + br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 +} + ; PR45526: don't vectorize with fold-tail if first-order-recurrence is live-out. 
; define i32 @pr45526() optsize { @@ -154,6 +186,37 @@ ret i32 %for } +define i32 @pr45526_pgso() !prof !14 { +; +; CHECK-LABEL: @pr45526_pgso +; CHECK-NEXT: entry: +; CHECK-NEXT: br label %loop +; CHECK-EMPTY: +; CHECK-NEXT: loop: +; CHECK-NEXT: %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] +; CHECK-NEXT: %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] +; CHECK-NEXT: %pivPlus1 = add nuw nsw i32 %piv, 1 +; CHECK-NEXT: %cond = icmp ult i32 %piv, 510 +; CHECK-NEXT: br i1 %cond, label %loop, label %exit +; CHECK-EMPTY: +; CHECK-NEXT: exit: +; CHECK-NEXT: %for.lcssa = phi i32 [ %for, %loop ] +; CHECK-NEXT: ret i32 %for.lcssa +; +entry: + br label %loop + +loop: + %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ] + %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ] + %pivPlus1 = add nuw nsw i32 %piv, 1 + %cond = icmp ult i32 %piv, 510 + br i1 %cond, label %loop, label %exit + +exit: + ret i32 %for +} + !llvm.module.flags = !{!0} !0 = !{i32 1, !"ProfileSummary", !1} !1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
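
Note: the IR-level changes above all gate size optimizations on the same
profile-guided size optimization (PGSO) query. A minimal sketch of that
pattern, assuming only the shouldOptimizeForSize() overload from
llvm/Transforms/Utils/SizeOpts.h that the diff itself calls; the wrapper name
optForSizeBasedOnProfile is illustrative and not part of the patch:

  #include "llvm/Analysis/BlockFrequencyInfo.h"
  #include "llvm/Analysis/ProfileSummaryInfo.h"
  #include "llvm/IR/Function.h"
  #include "llvm/Transforms/Utils/SizeOpts.h"
  using namespace llvm;

  // A block is treated as "optimize for size" either because the enclosing
  // function carries the optsize attribute (-Os/-Oz), or because the profile
  // (PSI + BFI) shows the block to be cold, so shrinking it cannot hurt the
  // hot path.
  static bool optForSizeBasedOnProfile(const Function &F, const BasicBlock *BB,
                                       ProfileSummaryInfo *PSI,
                                       BlockFrequencyInfo *BFI) {
    if (F.hasOptSize())
      return true; // The explicit attribute still wins unconditionally.
    // shouldOptimizeForSize() returns false when PSI or BFI is null or no
    // profile summary is attached, so non-PGO builds behave as before.
    return shouldOptimizeForSize(BB, PSI, BFI, PGSOQueryType::IRPass);
  }

The tests exercise this path by marking each _pgso function cold: the
!prof !14 metadata (function_entry_count 0) together with the module-level
ProfileSummary is what makes shouldOptimizeForSize() return true, which is why
the test files gain the !llvm.module.flags blocks repeated above.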