diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -203,6 +203,22 @@ SubtargetFeature<"fuse-logical", "HasLogicalFusion", "true", "Target supports Logical Operations fusion", [FeatureFusion]>; +def FeatureSha3Fusion : + SubtargetFeature<"fuse-sha3", "HasSha3Fusion", "true", + "Target supports SHA3 assist fusion", + [FeatureFusion]>; +def FeatureCompareFusion: + SubtargetFeature<"fuse-cmp", "HasCompareFusion", "true", + "Target supports Comparison Operations fusion", + [FeatureFusion]>; +def FeatureWideImmFusion: + SubtargetFeature<"fuse-wideimm", "HasWideImmFusion", "true", + "Target supports Wide-Immediate fusion", + [FeatureFusion]>; +def FeatureZeroMoveFusion: + SubtargetFeature<"fuse-zeromove", "HasZeroMoveFusion", "true", + "Target supports move to SPR with branch fusion", + [FeatureFusion]>; def FeatureUnalignedFloats : SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess", "true", "CPU does not trap on unaligned FP access">; @@ -393,7 +409,8 @@ // still exist with the exception of those we know are Power9 specific. 
list FusionFeatures = [ FeatureStoreFusion, FeatureAddLogicalFusion, FeatureLogicalAddFusion, - FeatureLogicalFusion, FeatureArithAddFusion + FeatureLogicalFusion, FeatureArithAddFusion, FeatureSha3Fusion, + FeatureZeroMoveFusion, FeatureWideImmFusion, FeatureCompareFusion, ]; list P10AdditionalFeatures = !listconcat(FusionFeatures, [ diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp --- a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp +++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp @@ -149,6 +149,79 @@ case FusionFeature::FK_SldiAdd: return (matchingImmOps(FirstMI, 2, 3) && matchingImmOps(FirstMI, 3, 60)) || (matchingImmOps(FirstMI, 2, 6) && matchingImmOps(FirstMI, 3, 57)); + + // rldicl rx, ra, 1, 0 - xor + case FusionFeature::FK_RotateLeftXor: + return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 0); + + // rldicr rx, ra, 1, 63 - xor + case FusionFeature::FK_RotateRightXor: + return matchingImmOps(FirstMI, 2, 1) && matchingImmOps(FirstMI, 3, 63); + + // We actually use CMPW* and CMPD*, 'l' doesn't exist as an operand in instr. 
+ + // { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 } + // { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 } + case FusionFeature::FK_LoadCmp1: + // { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 } + // { ld,ldx } - cmpli 0,1,rx,{ 0,1 } + case FusionFeature::FK_LoadCmp2: { + const MachineOperand &BT = SecondMI.getOperand(0); + if (!BT.isReg() || + (!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0)) + return false; + if (SecondMI.getOpcode() == PPC::CMPDI && + matchingImmOps(SecondMI, 2, -1, 16)) + return true; + return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1); + } + + // { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 } + case FusionFeature::FK_LoadCmp3: { + const MachineOperand &BT = SecondMI.getOperand(0); + if (!BT.isReg() || + (!Register::isVirtualRegister(BT.getReg()) && BT.getReg() != PPC::CR0)) + return false; + return matchingImmOps(SecondMI, 2, 0) || matchingImmOps(SecondMI, 2, 1) || + matchingImmOps(SecondMI, 2, -1, 16); + } + + // mtctr - { bcctr,bcctrl } + case FusionFeature::FK_ZeroMoveCTR: + // ( mtctr rx ) is alias of ( mtspr 9, rx ) + return (FirstMI.getOpcode() != PPC::MTSPR && + FirstMI.getOpcode() != PPC::MTSPR8) || + matchingImmOps(FirstMI, 0, 9); + + // mtlr - { bclr,bclrl } + case FusionFeature::FK_ZeroMoveLR: + // ( mtlr rx ) is alias of ( mtspr 8, rx ) + return (FirstMI.getOpcode() != PPC::MTSPR && + FirstMI.getOpcode() != PPC::MTSPR8) || + matchingImmOps(FirstMI, 0, 8); + + // addis rx,ra,si - addi rt,rx,SI, SI >= 0 + case FusionFeature::FK_AddisAddi: { + const MachineOperand &RA = FirstMI.getOperand(1); + const MachineOperand &SI = SecondMI.getOperand(2); + if (!SI.isImm() || !RA.isReg()) + return false; + if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8) + return false; + return SignExtend64(SI.getImm(), 16) >= 0; + } + + // addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 + case FusionFeature::FK_AddiAddis: { + const MachineOperand &RA = FirstMI.getOperand(1); + const MachineOperand &SI = 
+ FirstMI.getOperand(2); + if (!SI.isImm() || !RA.isReg()) + return false; + if (RA.getReg() == PPC::ZERO || RA.getReg() == PPC::ZERO8) + return false; + int64_t ExtendedSI = SignExtend64(SI.getImm(), 16); + return ExtendedSI >= 2; + } } llvm_unreachable("All the cases should have been handled"); diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def --- a/llvm/lib/Target/PowerPC/PPCMacroFusion.def +++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def @@ -78,5 +78,80 @@ FUSION_FEATURE(SldiAdd, hasArithAddFusion, -1, FUSION_OP_SET(RLDICR, RLDICR_32), FUSION_OP_SET(ADD4, ADD8, SUBF, SUBF8)) +// rldicl rx, ra, 1, 0 - xor +FUSION_FEATURE(RotateLeftXor, hasSha3Fusion, 1, + FUSION_OP_SET(RLDICL, RLDICL_32, RLDICL_32_64), + FUSION_OP_SET(XOR, XOR8)) + +// rldicr rx, ra, 1, 63 - xor +FUSION_FEATURE(RotateRightXor, hasSha3Fusion, 1, + FUSION_OP_SET(RLDICR, RLDICR_32), FUSION_OP_SET(XOR, XOR8)) + +// There are two special cases in the 'load-compare' series, so we have to +// split them into several pattern groups to fit into the current framework. +// This can become clearer once we switch to a more expressive approach.
+ +// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpi 0,1,rx,{ 0,1,-1 } +// { lbz,lbzx,lhz,lhzx,lwz,lwzx } - cmpli 0,L,rx,{ 0,1 } +FUSION_FEATURE(LoadCmp1, hasCompareFusion, 1, + FUSION_OP_SET(LBZ, LBZ8, LBZX, LBZX8, LBZXTLS, LBZXTLS_, + LBZXTLS_32, LHZ, LHZ8, LHZX, LHZX8, LHZXTLS, + LHZXTLS_, LHZXTLS_32, LWZ, LWZ8, LWZX, LWZX8, + LWZXTLS, LWZXTLS_, LWZXTLS_32), + FUSION_OP_SET(CMPDI, CMPLDI, CMPLWI)) + +// { ld,ldx } - cmpi 0,1,rx,{ 0,1,-1 } +// { ld,ldx } - cmpli 0,1,rx,{ 0,1 } +FUSION_FEATURE(LoadCmp2, hasCompareFusion, 1, + FUSION_OP_SET(LD, LDX, LDXTLS, LDXTLS_), + FUSION_OP_SET(CMPDI, CMPLDI)) + +// { lha,lhax,lwa,lwax } - cmpi 0,L,rx,{ 0,1,-1 } +FUSION_FEATURE(LoadCmp3, hasCompareFusion, 1, + FUSION_OP_SET(LHA, LHA8, LHAX, LHAX8, LWA, LWA_32, LWAX, + LWAX_32), + FUSION_OP_SET(CMPLDI, CMPLWI)) + +// ori - oris +FUSION_FEATURE(OriOris, hasWideImmFusion, 1, FUSION_OP_SET(ORI, ORI8), + FUSION_OP_SET(ORIS, ORIS8)) + +// lis - ori +FUSION_FEATURE(LisOri, hasWideImmFusion, 1, FUSION_OP_SET(LIS, LIS8), + FUSION_OP_SET(ORI, ORI8)) + +// oris - ori +FUSION_FEATURE(OrisOri, hasWideImmFusion, 1, FUSION_OP_SET(ORIS, ORIS8), + FUSION_OP_SET(ORI, ORI8)) + +// xori - xoris +FUSION_FEATURE(XoriXoris, hasWideImmFusion, 1, FUSION_OP_SET(XORI, XORI8), + FUSION_OP_SET(XORIS, XORIS8)) + +// xoris - xori +FUSION_FEATURE(XorisXori, hasWideImmFusion, 1, FUSION_OP_SET(XORIS, XORIS8), + FUSION_OP_SET(XORI, XORI8)) + +// addis rx,ra,si - addi rt,rx,SI, SI >= 0 +FUSION_FEATURE(AddisAddi, hasWideImmFusion, 1, + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8), + FUSION_OP_SET(ADDI, ADDI8, ADDItocL)) + +// addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 +FUSION_FEATURE(AddiAddis, hasWideImmFusion, 1, + FUSION_OP_SET(ADDI, ADDI8, ADDItocL), + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8)) + +// mtctr - { bcctr,bcctrl } +FUSION_FEATURE(ZeroMoveCTR, hasZeroMoveFusion, -1, + FUSION_OP_SET(MTCTR, MTCTRloop, MTSPR8, MTSPR), + FUSION_OP_SET(BCCTR, BCCTRn, BCCTR8, BCCTR8n, BCCTRL, BCCTRLn, + BCCTRL8, BCCTRL8n, 
gBCCTR, gBCCTRL)) + +// mtlr - { bclr,bclrl } +FUSION_FEATURE(ZeroMoveLR, hasZeroMoveFusion, -1, + FUSION_OP_SET(MTLR8, MTLR, MTSPR8, MTSPR), + FUSION_OP_SET(BCLR, BCLRn, gBCLR, BCLRL, BCLRLn, gBCLRL)) + #undef FUSION_FEATURE #undef FUSION_OP_SET diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -151,6 +151,10 @@ bool HasAddLogicalFusion; bool HasLogicalAddFusion; bool HasLogicalFusion; + bool HasSha3Fusion; + bool HasCompareFusion; + bool HasWideImmFusion; + bool HasZeroMoveFusion; bool IsISA2_06; bool IsISA2_07; bool IsISA3_0; @@ -340,6 +344,10 @@ bool hasAddLogicalFusion() const { return HasAddLogicalFusion; } bool hasLogicalAddFusion() const { return HasLogicalAddFusion; } bool hasLogicalFusion() const { return HasLogicalFusion; } + bool hasCompareFusion() const { return HasCompareFusion; } + bool hasWideImmFusion() const { return HasWideImmFusion; } + bool hasSha3Fusion() const { return HasSha3Fusion; } + bool hasZeroMoveFusion() const { return HasZeroMoveFusion; } bool needsSwapsForVSXMemOps() const { return hasVSX() && isLittleEndian() && !hasP9Vector(); } diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -131,6 +131,10 @@ HasAddLogicalFusion = false; HasLogicalAddFusion = false; HasLogicalFusion = false; + HasSha3Fusion = false; + HasCompareFusion = false; + HasWideImmFusion = false; + HasZeroMoveFusion = false; IsISA2_06 = false; IsISA2_07 = false; IsISA3_0 = false; diff --git a/llvm/test/CodeGen/PowerPC/macro-fusion.mir b/llvm/test/CodeGen/PowerPC/macro-fusion.mir --- a/llvm/test/CodeGen/PowerPC/macro-fusion.mir +++ b/llvm/test/CodeGen/PowerPC/macro-fusion.mir @@ -93,3 +93,55 @@ renamable $x3 = ADD8 killed renamable $x4, $x5 BLR8 implicit $lr8, implicit $rm, implicit $x3 ... 
+ +# CHECK: rldicl_xor:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / RLDICL - XOR8 +--- +name: rldicl_xor +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4, $x5 + renamable $x4 = RLDICL $x3, 1, 0 + renamable $x3 = XOR8 killed renamable $x4, $x5 + BLR8 implicit $lr8, implicit $rm, implicit $x3 +... + +# CHECK: rldicr_xor:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / RLDICR - XOR8 +--- +name: rldicr_xor +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4, $x5 + renamable $x4 = RLDICR $x3, 1, 63 + renamable $x3 = XOR8 killed renamable $x4, $x5 + BLR8 implicit $lr8, implicit $rm, implicit $x3 +... + +# CHECK: ori_oris:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / ORI8 - ORIS8 +--- +name: ori_oris +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4 + renamable $x4 = ORI8 $x3, 63 + renamable $x3 = ORIS8 killed renamable $x4, 20 + BLR8 implicit $lr8, implicit $rm, implicit $x3 +... + +# CHECK: load_cmp:%bb.0 +# CHECK: Macro fuse: SU(0) - SU(1) / LD - CMPDI +--- +name: load_cmp +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3, $x4, $x5 + renamable $x3 = LD 0, killed renamable $x3 + renamable $cr0 = CMPDI killed renamable $x3, 0 + renamable $x3 = ISEL8 killed renamable $x5, killed renamable $x4, renamable $cr0lt, implicit killed $cr0 + BLR8 implicit $lr8, implicit $rm, implicit $x3 diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll --- a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll +++ b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll @@ -26,13 +26,13 @@ ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: mfcr r12 ; CHECK-NEXT: stw r12, 8(r1) -; CHECK-NEXT: ld r3, 0(r3) ; CHECK-NEXT: ld r4, 0(0) -; CHECK-NEXT: ld r5, 56(0) -; CHECK-NEXT: cmpdi cr1, r3, 0 -; CHECK-NEXT: cmpdi cr4, r4, 0 -; CHECK-NEXT: cmpdi cr6, r5, 0 +; CHECK-NEXT: ld r3, 0(r3) ; CHECK-NEXT: cmpldi r3, 0 +; CHECK-NEXT: cmpdi cr4, r4, 0 +; CHECK-NEXT: ld r4, 56(0) +; CHECK-NEXT: cmpdi cr1, r3, 0 +; CHECK-NEXT: cmpdi 
cr6, r4, 0 ; CHECK-NEXT: beq cr0, .LBB0_3 ; CHECK-NEXT: # %bb.1: # %bb10 ; CHECK-NEXT: lwz r3, 0(r3) diff --git a/llvm/test/CodeGen/PowerPC/ppc64-rop-protection.ll b/llvm/test/CodeGen/PowerPC/ppc64-rop-protection.ll --- a/llvm/test/CodeGen/PowerPC/ppc64-rop-protection.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-rop-protection.ll @@ -2867,21 +2867,21 @@ ; LE-P10-LABEL: aligned: ; LE-P10: # %bb.0: # %entry ; LE-P10-NEXT: mflr r0 -; LE-P10-NEXT: lis r12, -1 ; LE-P10-NEXT: std r30, -16(r1) +; LE-P10-NEXT: lis r12, -1 +; LE-P10-NEXT: ori r12, r12, 0 ; LE-P10-NEXT: mr r30, r1 ; LE-P10-NEXT: std r0, 16(r1) ; LE-P10-NEXT: hashst r0, -32(r1) ; LE-P10-NEXT: clrldi r0, r1, 49 -; LE-P10-NEXT: ori r12, r12, 0 ; LE-P10-NEXT: subc r0, r12, r0 ; LE-P10-NEXT: stdux r1, r1, r0 ; LE-P10-NEXT: std r29, -24(r30) # 8-byte Folded Spill ; LE-P10-NEXT: mr r29, r3 ; LE-P10-NEXT: lwz r3, 4(r3) ; LE-P10-NEXT: lis r4, 0 -; LE-P10-NEXT: addi r5, r1, 32764 ; LE-P10-NEXT: ori r4, r4, 65500 +; LE-P10-NEXT: addi r5, r1, 32764 ; LE-P10-NEXT: stwx r3, r1, r4 ; LE-P10-NEXT: lwz r3, 12(r29) ; LE-P10-NEXT: lis r4, 0 @@ -3130,21 +3130,21 @@ ; BE-P10-LABEL: aligned: ; BE-P10: # %bb.0: # %entry ; BE-P10-NEXT: mflr r0 -; BE-P10-NEXT: lis r12, -1 ; BE-P10-NEXT: std r30, -16(r1) +; BE-P10-NEXT: lis r12, -1 +; BE-P10-NEXT: ori r12, r12, 0 ; BE-P10-NEXT: mr r30, r1 ; BE-P10-NEXT: std r0, 16(r1) ; BE-P10-NEXT: hashst r0, -32(r1) ; BE-P10-NEXT: clrldi r0, r1, 49 -; BE-P10-NEXT: ori r12, r12, 0 ; BE-P10-NEXT: subc r0, r12, r0 ; BE-P10-NEXT: stdux r1, r1, r0 ; BE-P10-NEXT: std r29, -24(r30) # 8-byte Folded Spill ; BE-P10-NEXT: mr r29, r3 ; BE-P10-NEXT: lwz r3, 4(r3) ; BE-P10-NEXT: lis r4, 0 -; BE-P10-NEXT: addi r5, r1, 32764 ; BE-P10-NEXT: ori r4, r4, 65500 +; BE-P10-NEXT: addi r5, r1, 32764 ; BE-P10-NEXT: stwx r3, r1, r4 ; BE-P10-NEXT: lwz r3, 12(r29) ; BE-P10-NEXT: lis r4, 0 @@ -3262,21 +3262,21 @@ ; LE-P10-PRIV-LABEL: aligned: ; LE-P10-PRIV: # %bb.0: # %entry ; LE-P10-PRIV-NEXT: mflr r0 -; LE-P10-PRIV-NEXT: lis 
r12, -1 ; LE-P10-PRIV-NEXT: std r30, -16(r1) +; LE-P10-PRIV-NEXT: lis r12, -1 +; LE-P10-PRIV-NEXT: ori r12, r12, 0 ; LE-P10-PRIV-NEXT: mr r30, r1 ; LE-P10-PRIV-NEXT: std r0, 16(r1) ; LE-P10-PRIV-NEXT: hashstp r0, -32(r1) ; LE-P10-PRIV-NEXT: clrldi r0, r1, 49 -; LE-P10-PRIV-NEXT: ori r12, r12, 0 ; LE-P10-PRIV-NEXT: subc r0, r12, r0 ; LE-P10-PRIV-NEXT: stdux r1, r1, r0 ; LE-P10-PRIV-NEXT: std r29, -24(r30) # 8-byte Folded Spill ; LE-P10-PRIV-NEXT: mr r29, r3 ; LE-P10-PRIV-NEXT: lwz r3, 4(r3) ; LE-P10-PRIV-NEXT: lis r4, 0 -; LE-P10-PRIV-NEXT: addi r5, r1, 32764 ; LE-P10-PRIV-NEXT: ori r4, r4, 65500 +; LE-P10-PRIV-NEXT: addi r5, r1, 32764 ; LE-P10-PRIV-NEXT: stwx r3, r1, r4 ; LE-P10-PRIV-NEXT: lwz r3, 12(r29) ; LE-P10-PRIV-NEXT: lis r4, 0 @@ -3393,21 +3393,21 @@ ; BE-P10-PRIV-LABEL: aligned: ; BE-P10-PRIV: # %bb.0: # %entry ; BE-P10-PRIV-NEXT: mflr r0 -; BE-P10-PRIV-NEXT: lis r12, -1 ; BE-P10-PRIV-NEXT: std r30, -16(r1) +; BE-P10-PRIV-NEXT: lis r12, -1 +; BE-P10-PRIV-NEXT: ori r12, r12, 0 ; BE-P10-PRIV-NEXT: mr r30, r1 ; BE-P10-PRIV-NEXT: std r0, 16(r1) ; BE-P10-PRIV-NEXT: hashstp r0, -32(r1) ; BE-P10-PRIV-NEXT: clrldi r0, r1, 49 -; BE-P10-PRIV-NEXT: ori r12, r12, 0 ; BE-P10-PRIV-NEXT: subc r0, r12, r0 ; BE-P10-PRIV-NEXT: stdux r1, r1, r0 ; BE-P10-PRIV-NEXT: std r29, -24(r30) # 8-byte Folded Spill ; BE-P10-PRIV-NEXT: mr r29, r3 ; BE-P10-PRIV-NEXT: lwz r3, 4(r3) ; BE-P10-PRIV-NEXT: lis r4, 0 -; BE-P10-PRIV-NEXT: addi r5, r1, 32764 ; BE-P10-PRIV-NEXT: ori r4, r4, 65500 +; BE-P10-PRIV-NEXT: addi r5, r1, 32764 ; BE-P10-PRIV-NEXT: stwx r3, r1, r4 ; BE-P10-PRIV-NEXT: lwz r3, 12(r29) ; BE-P10-PRIV-NEXT: lis r4, 0