diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td --- a/llvm/lib/Target/X86/X86ScheduleZnver4.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td @@ -35,9 +35,10 @@ // with large values here the compilation of certain loops // ends up taking way too long. // Ideally for znver4, we should have 6.75K. However we don't add that - // considerting the impact compile time and prefer using default values + // considerting the impact compile time and prefer using default values // instead. - // let LoopMicroOpBufferSize = 6750; + // Retaining minimal value to influence unrolling as we did for znver3. + let LoopMicroOpBufferSize = 512; // AMD SOG 19h, 2.6.2 L1 Data Cache // The L1 data cache has a 4- or 5- cycle integer load-to-use latency. // AMD SOG 19h, 2.12 L1 Data Cache @@ -1538,7 +1539,7 @@ let NumMicroOps = 1; } def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex - "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz", + "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz", "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)", "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz" )>; @@ -1790,20 +1791,20 @@ let ResourceCycles = [2]; let NumMicroOps = 1; } -def : InstRW<[Zn4VecALUZSlow], (instrs - VPABSBZ128rr, VPABSBZ128rrk, VPABSBZ128rrkz, VPABSDZ128rr, - VPABSDZ128rrk, VPABSDZ128rrkz, VPABSQZ128rr, VPABSQZ128rrk, - VPABSQZ128rrkz, VPABSWZ128rr, VPABSWZ128rrk, VPABSWZ128rrkz, - VPADDSBZ128rr, VPADDSBZ128rrk, VPADDSBZ128rrkz, VPADDSWZ128rr, - VPADDSWZ128rrk, VPADDSWZ128rrkz,VPADDUSBZ128rr, VPADDUSBZ128rrk, - VPADDUSBZ128rrkz, VPADDUSWZ128rr, VPADDUSWZ128rrk, VPADDUSWZ128rrkz, - VPAVGBZ128rr, VPAVGBZ128rrk, VPAVGBZ128rrkz, VPAVGWZ128rr, - VPAVGWZ128rrk, VPAVGWZ128rrkz, VPOPCNTBZ128rr, VPOPCNTBZ128rrk, - VPOPCNTBZ128rrkz, VPOPCNTDZ128rr, VPOPCNTDZ128rrk, VPOPCNTDZ128rrkz, - VPOPCNTQZ128rr, VPOPCNTQZ128rrk,VPOPCNTQZ128rrkz, VPOPCNTWZ128rr, - VPOPCNTWZ128rrk, VPOPCNTWZ128rrkz,VPSUBSBZ128rr, VPSUBSBZ128rrk, - VPSUBSBZ128rrkz, VPSUBSWZ128rr, VPSUBSWZ128rrk, VPSUBSWZ128rrkz, - VPSUBUSBZ128rr, VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,VPSUBUSWZ128rr, +def : InstRW<[Zn4VecALUZSlow], (instrs + VPABSBZ128rr, VPABSBZ128rrk, VPABSBZ128rrkz, VPABSDZ128rr, + VPABSDZ128rrk, VPABSDZ128rrkz, VPABSQZ128rr, VPABSQZ128rrk, + VPABSQZ128rrkz, VPABSWZ128rr, VPABSWZ128rrk, VPABSWZ128rrkz, + VPADDSBZ128rr, VPADDSBZ128rrk, VPADDSBZ128rrkz, VPADDSWZ128rr, + VPADDSWZ128rrk, VPADDSWZ128rrkz,VPADDUSBZ128rr, VPADDUSBZ128rrk, + VPADDUSBZ128rrkz, VPADDUSWZ128rr, VPADDUSWZ128rrk, VPADDUSWZ128rrkz, + VPAVGBZ128rr, VPAVGBZ128rrk, VPAVGBZ128rrkz, VPAVGWZ128rr, + VPAVGWZ128rrk, VPAVGWZ128rrkz, VPOPCNTBZ128rr, VPOPCNTBZ128rrk, + VPOPCNTBZ128rrkz, VPOPCNTDZ128rr, VPOPCNTDZ128rrk, VPOPCNTDZ128rrkz, + VPOPCNTQZ128rr, VPOPCNTQZ128rrk,VPOPCNTQZ128rrkz, VPOPCNTWZ128rr, + VPOPCNTWZ128rrk, VPOPCNTWZ128rrkz,VPSUBSBZ128rr, VPSUBSBZ128rrk, + VPSUBSBZ128rrkz, VPSUBSWZ128rr, VPSUBSWZ128rrk, VPSUBSWZ128rrkz, + VPSUBUSBZ128rr, VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,VPSUBUSWZ128rr, VPSUBUSWZ128rrk, VPSUBUSWZ128rrkz )>; diff --git a/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll b/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll --- a/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll +++ b/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll @@ -1,5 +1,6 @@ ; RUN: opt -passes=debugify,loop-unroll -mcpu=znver3 -pass-remarks=loop-unroll -pass-remarks-analysis=loop-unroll < %s -S 2>&1 | FileCheck --check-prefixes=ALL,UNROLL %s ; RUN: opt -passes=debugify,loop-unroll -mcpu=znver3 -pass-remarks=TTI -pass-remarks-analysis=TTI < %s -S 2>&1 | FileCheck --check-prefixes=ALL,TTI %s +; RUN: opt -passes=debugify,loop-unroll -mcpu=znver4 -pass-remarks=loop-unroll -pass-remarks-analysis=loop-unroll < %s -S 2>&1 | FileCheck --check-prefixes=ALL,UNROLL %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu"