[X86][SandyBridge] Additional updates to the SNB instructions scheduling information
ClosedPublic

Authored by gadi.haber on Aug 7 2017, 12:59 AM.

Details

Summary

This is a continuation patch for commit r307529 which completely replaces the scheduling information for the SandyBridge architecture target by modifying the file X86SchedSandyBridge.td located under the X86 Target (see also https://reviews.llvm.org/D35019).

In this patch we added the scheduling information of additional SNB instructions that were missing from the patch commit r307529, fixed the scheduling of several resource groups that include only port0 instead of port05 (i.e., port0 OR port5) and fixed several incorrect instructions' scheduling in the r307529 commit.

The patch also includes the X87 instructions which were missing in previous patch commit r307529 as reported in bugzilla bug 34080.

Diff Detail

Repository
rL LLVM
gadi.haber created this revision.Aug 7 2017, 12:59 AM

LGTM
(Still not accepting to give the other reviewers chance to comment).

The patch also includes the X87 instructions which were missing in previous patch commit r307529 as reported in bugzilla bug 34080.

Just to be clear, this doesn't fix PR34080, it just reschedules the codegen but still doesn't enforce the x86 cw barriers that we need - but then this is a scheduler so all its done is expose the issue, not caused it.

RKSimon added a reviewer: dim.Aug 9 2017, 2:14 AM
RKSimon added a subscriber: dim.

@dim What effect does this patch have on PR34080? The changes in pr34080.ll aren't looking hopeful

zvi added inline comments.Aug 9 2017, 5:47 AM
lib/Target/X86/X86SchedSandyBridge.td
1019 ↗(On Diff #109955)

Why was VBROADCASTF128 removed?go?

zvi added a comment.Aug 9 2017, 6:10 AM

Please disregard my previous comment. I found a duplicate of VBROADCASTF128 in line 1524

dim added a comment.Aug 12 2017, 12:52 PM

@dim What effect does this patch have on PR34080? The changes in pr34080.ll aren't looking hopeful

It seems to fix my minimized test case, e.g.:

#include <stdio.h>

__attribute__((noinline)) int g(double *tx) {
  int bad = (tx[2] != 0x1.93p+16);
  printf("tx[0]=%.20a tx[1]=%.20a tx[2]=%.20a: %s\n", tx[0], tx[1], tx[2], bad ? "Bad!" : "OK");
  return bad;
}

__attribute__((noinline)) int f(long double z) {
  int i;
  double tx[3];
  for (i = 0; i < 2; i++) {
    tx[i] = (double)((int)(z));
    z = (z - tx[i]) * 1.6777216e+07;
    //printf("z=%.20La\n", z);
  }
  tx[2] = z;
  return g(tx);
}

int main(void)
{
  return f(0x1.b2f3ee96e7600326p+23L);
}

Although the resulting assembly is very slightly different from before rL307529. So with trunk rL307528, it gives (only the body of function f shown):

fldt    64(%rsp)
fnstcw  6(%rsp)
movzwl  6(%rsp), %eax
movw    $3199, 6(%rsp)          # imm = 0xC7F
fldcw   6(%rsp)
movw    %ax, 6(%rsp)
fistl   8(%rsp)
fldcw   6(%rsp)
cvtsi2sdl       8(%rsp), %xmm0
movsd   %xmm0, 32(%rsp)
movsd   %xmm0, 24(%rsp)
fsubl   24(%rsp)
flds    .LCPI1_0(%rip)
fmul    %st(0), %st(1)
fnstcw  4(%rsp)
movzwl  4(%rsp), %eax
movw    $3199, 4(%rsp)          # imm = 0xC7F
fldcw   4(%rsp)
movw    %ax, 4(%rsp)
fxch    %st(1)
fistl   12(%rsp)
fldcw   4(%rsp)
xorps   %xmm0, %xmm0
cvtsi2sdl       12(%rsp), %xmm0
movsd   %xmm0, 40(%rsp)
movsd   %xmm0, 16(%rsp)
fsubl   16(%rsp)
fmulp   %st(1)
fstpl   48(%rsp)
leaq    32(%rsp), %rdi
callq   g
addq    $56, %rsp
retq

With trunk rL307529 through rL310782, it gives:

fnstcw  6(%rsp)
movzwl  6(%rsp), %eax
movw    $3199, 6(%rsp)          # imm = 0xC7F
fldcw   6(%rsp)
fldt    64(%rsp)
movw    %ax, 6(%rsp)
fistl   8(%rsp)
fldcw   6(%rsp)
cvtsi2sdl       8(%rsp), %xmm0
movsd   %xmm0, 32(%rsp)
movsd   %xmm0, 24(%rsp)
fsubl   24(%rsp)
fnstcw  4(%rsp)
flds    .LCPI1_0(%rip)
movzwl  4(%rsp), %eax
movw    $3199, 4(%rsp)          # imm = 0xC7F
fldcw   4(%rsp)
fmul    %st(0), %st(1)
movw    %ax, 4(%rsp)
fxch    %st(1)
fistl   12(%rsp)
fldcw   4(%rsp)
xorps   %xmm0, %xmm0
cvtsi2sdl       12(%rsp), %xmm0
movsd   %xmm0, 40(%rsp)
movsd   %xmm0, 16(%rsp)
fsubl   16(%rsp)
fmulp   %st(1)
fstpl   48(%rsp)
leaq    32(%rsp), %rdi
callq   g
addq    $56, %rsp
retq

Where the most important change (and the source of errors) is that the flds .LCPI1_0(%rip) and fmul %st(0), %st(1) are moved apart from each other.

However, applying D36388 to rL310782 results in:

fnstcw  6(%rsp)
fldt    64(%rsp)
movzwl  6(%rsp), %eax
movw    $3199, 6(%rsp)          # imm = 0xC7F
fldcw   6(%rsp)
movw    %ax, 6(%rsp)
fistl   8(%rsp)
fldcw   6(%rsp)
cvtsi2sdl       8(%rsp), %xmm0
movsd   %xmm0, 32(%rsp)
movsd   %xmm0, 24(%rsp)
fsubl   24(%rsp)
flds    .LCPI1_0(%rip)
fnstcw  4(%rsp)
fmul    %st(0), %st(1)
movzwl  4(%rsp), %eax
movw    $3199, 4(%rsp)          # imm = 0xC7F
fldcw   4(%rsp)
movw    %ax, 4(%rsp)
fxch    %st(1)
fistl   12(%rsp)
fldcw   4(%rsp)
xorps   %xmm0, %xmm0
cvtsi2sdl       12(%rsp), %xmm0
movsd   %xmm0, 40(%rsp)
movsd   %xmm0, 16(%rsp)
fsubl   16(%rsp)
fmulp   %st(1)
fstpl   48(%rsp)
leaq    32(%rsp), %rdi
callq   g
addq    $56, %rsp
retq

So the flds .LCPI1_0(%rip) and fmul %st(0), %st(1) are now only interspersed with a fnstcw 4(%rsp), which does not seem to affect the outcome.

Diff of the assembly output of stock rL307528 and rL310782 with D36388:

--- pio2n-r307528.s
+++ pio2n-r310782-D36388.s
@@ -53,8 +53,8 @@
        subq    $56, %rsp
 .Lcfi2:
        .cfi_def_cfa_offset 64
-       fldt    64(%rsp)
        fnstcw  6(%rsp)
+       fldt    64(%rsp)
        movzwl  6(%rsp), %eax
        movw    $3199, 6(%rsp)          # imm = 0xC7F
        fldcw   6(%rsp)
@@ -66,8 +66,8 @@
        movsd   %xmm0, 24(%rsp)
        fsubl   24(%rsp)
        flds    .LCPI1_0(%rip)
-       fmul    %st(0), %st(1)
        fnstcw  4(%rsp)
+       fmul    %st(0), %st(1)
        movzwl  4(%rsp), %eax
        movw    $3199, 4(%rsp)          # imm = 0xC7F
        fldcw   4(%rsp)
zvi accepted this revision.Aug 13 2017, 1:44 AM

LGTM

This revision is now accepted and ready to land.Aug 13 2017, 1:44 AM
This revision was automatically updated to reflect the committed changes.

FYI, this shows a *very* substantial regression (>20% for some file formats and operations) for us on a compression benchmark. It is similar to the open source Snappy code. The open source Snappy benchmark regresses as well, but less severe: https://github.com/google/snappy

We're looking at why.

20% is high. Try to verify if this is not code alignment related.

We verified that the regression we saw internally is due to loop alignment.

I see. Code alignment in X86 is indeed an issue.
There are currently two pending reviews for patches that start providing a fix for the code alignment issue in X86.
The reviews are in a pending state for quite some time now.
Maybe we can give them higher priority and get them in?

The first patch (at https://reviews.llvm.org/D34393) is an NFC that provides the needed infrastructure for inserting padding code (NOPs) aimed only for performance stability. This will give us the ability to insert fixes for cases of regression caused by code alignment effects.

The second patch (at https://reviews.llvm.org/D34396) already uses the above infrastructure to handle the first case of regressions caused by misprediction of branches to same target addresses in a loop = which are resolved by appropriate code padding.

hans added a subscriber: hans.Aug 22 2017, 11:27 AM

This was proposed for the 5.0 branch as a fix for PR34080. But as it's really big and there seems to problems, I'm not keen to take it.

Would it be possible to break out the "The patch also includes the X87 instructions" part into a separate patch?

Unfortunately, splitting the patch creates an even worse case of unstable scheduling which produces regressions.
I am working on fine tuning the existing scheduling - which I hope to put on review soon.

Note that there may be a better solution here.
Currently, the default -x86_64 default flag is set to SandyBridge.
I would recommend to try changing it to Haswell and see if you get a better scheduling.

Here is the relevant setting in X86.td under LLVM lib Target X86:

We currently use the Sandy Bridge model as the default scheduling model as
we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which
covers a huge swath of x86 processors. If there are specific scheduling
knobs which need to be tuned differently for AMD chips, we might consider
// forming a common base for them.
def : ProcessorModel<"x86-64", SandyBridgeModel,

[FeatureX87, FeatureMMX, FeatureSSE2, FeatureFXSR,
 Feature64Bit, FeatureSlowBTMem ]>;
hans added a comment.Aug 23 2017, 2:20 PM
In D36388#848960, @hans wrote:

This was proposed for the 5.0 branch as a fix for PR34080. But as it's really big and there seems to problems, I'm not keen to take it.

Would it be possible to break out the "The patch also includes the X87 instructions" part into a separate patch?

I've reverted the original scheduling change that introduced the problem, r307529, from the 5.0 branch in r311600 to unblock the release.