Index: lib/Target/X86/X86ScheduleBtVer2.td
===================================================================
--- lib/Target/X86/X86ScheduleBtVer2.td
+++ lib/Target/X86/X86ScheduleBtVer2.td
@@ -135,6 +135,30 @@
 defm : JWriteResIntPair;
+def WriteSHLDrri : SchedWriteRes<[JALU01]> {
+  let Latency = 3;
+  let ResourceCycles = [6];
+  let NumMicroOps = 6;
+}
+def: InstRW<[WriteSHLDrri], (instregex "SHLD(16|32|64)rri8")>;
+def: InstRW<[WriteSHLDrri], (instregex "SHRD(16|32|64)rri8")>;
+
+def WriteSHLDrrCL : SchedWriteRes<[JALU01]> {
+  let Latency = 4;
+  let ResourceCycles = [8];
+  let NumMicroOps = 7;
+}
+def: InstRW<[WriteSHLDrrCL], (instregex "SHLD(16|32|64)rrCL")>;
+def: InstRW<[WriteSHLDrrCL], (instregex "SHRD(16|32|64)rrCL")>;
+
+def WriteSHLDLd : SchedWriteRes<[JALU01]> {
+  let Latency = 9;
+  let ResourceCycles = [22];
+  let NumMicroOps = 8;
+}
+def: InstRW<[WriteSHLDLd], (instregex "SHLD(16|32|64)mr(i8|CL)")>;
+def: InstRW<[WriteSHLDLd], (instregex "SHRD(16|32|64)mr(i8|CL)")>;
+
 ////////////////////////////////////////////////////////////////////////////////
 // Loads, stores, and moves, not folded with other operations.
 // FIXME: Split x86 and SSE load/store/moves
Index: test/CodeGen/X86/schedule-x86-64-shld.ll
===================================================================
--- test/CodeGen/X86/schedule-x86-64-shld.ll
+++ test/CodeGen/X86/schedule-x86-64-shld.ll
@@ -0,0 +1,687 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER1
+
+
+; clang -Oz -c test1.cpp -emit-llvm -S -o
+; Verify that we generate the shld instruction when we are optimizing for
+; size, even for X86_64 processors that are known to have poor latency double
+; precision shift instructions.
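+;
+; As a rough cross-check of the new BtVer2 entries (assuming JALU01 is the
+; two-unit ALU group from X86ScheduleBtVer2.td), -print-schedule annotates
+; each instruction as [Latency:RThroughput], with RThroughput being
+; ResourceCycles divided by the number of units in the group:
+;
+;   WriteSHLDrri:  Latency = 3, ResourceCycles = [6]  -> [3:3.00]
+;   WriteSHLDrrCL: Latency = 4, ResourceCycles = [8]  -> [4:4.00]
+;   WriteSHLDLd:   Latency = 9, ResourceCycles = [22] -> [9:11.00]
+;
+; These are the values the BTVER2 check lines below (and the updated checks
+; in schedule-x86_64.ll) expect.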
+; uint64_t lshift10(uint64_t a, uint64_t b)
+; {
+; return (a << 10) | (b >> 54);
+; }
+
+; Function Attrs: minsize nounwind readnone uwtable
+define i64 @_Z8lshift10mm(i64 %a, i64 %b) #0 {
+; GENERIC-LABEL: _Z8lshift10mm:
+; GENERIC: # BB#0: # %entry
+; GENERIC-NEXT: shldq $10, %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: _Z8lshift10mm:
+; ATOM: # BB#0: # %entry
+; ATOM-NEXT: shldq $10, %rsi, %rdi # sched: [9:4.50]
+; ATOM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: _Z8lshift10mm:
+; SLM: # BB#0: # %entry
+; SLM-NEXT: shldq $10, %rsi, %rdi # sched: [1:1.00]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: _Z8lshift10mm:
+; SANDY: # BB#0: # %entry
+; SANDY-NEXT: shldq $10, %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: _Z8lshift10mm:
+; HASWELL: # BB#0: # %entry
+; HASWELL-NEXT: shldq $10, %rsi, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: _Z8lshift10mm:
+; BROADWELL: # BB#0: # %entry
+; BROADWELL-NEXT: shldq $10, %rsi, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: _Z8lshift10mm:
+; SKYLAKE: # BB#0: # %entry
+; SKYLAKE-NEXT: shldq $10, %rsi, %rdi # sched: [3:1.00]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: _Z8lshift10mm:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: shldq $10, %rsi, %rdi # sched: [3:1.00]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: _Z8lshift10mm:
+; BTVER2: # BB#0: # %entry
+; BTVER2-NEXT: shldq $10, %rsi, %rdi # sched: [3:3.00]
+; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: _Z8lshift10mm:
+; ZNVER1: # BB#0: # %entry
+; ZNVER1-NEXT: shldq $10, %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+;
+; BDVER1-LABEL: _Z8lshift10mm:
+; BDVER1: # BB#0: # %entry
+; BDVER1-NEXT: shldq $10, %rsi, %rdi
+; BDVER1-NEXT: movq %rdi, %rax
+; BDVER1-NEXT: retq
+entry:
+  %shl = shl i64 %a, 10
+  %shr = lshr i64 %b, 54
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+attributes #0 = { minsize nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
+; clang -Os -c test2.cpp -emit-llvm -S
+; Verify that we generate the shld instruction when we are optimizing for
+; size, even for X86_64 processors that are known to have poor latency double
+; precision shift instructions.
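+;
+; lshift11 below is the -Os twin of lshift10 above: the only difference in
+; the IR is the optsize attribute (#1) in place of minsize (#0), and both
+; keep the shld form on every target.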
+; uint64_t lshift11(uint64_t a, uint64_t b)
+; {
+; return (a << 11) | (b >> 53);
+; }
+
+; Function Attrs: nounwind optsize readnone uwtable
+define i64 @_Z8lshift11mm(i64 %a, i64 %b) #1 {
+; GENERIC-LABEL: _Z8lshift11mm:
+; GENERIC: # BB#0: # %entry
+; GENERIC-NEXT: shldq $11, %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: _Z8lshift11mm:
+; ATOM: # BB#0: # %entry
+; ATOM-NEXT: shldq $11, %rsi, %rdi # sched: [9:4.50]
+; ATOM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: _Z8lshift11mm:
+; SLM: # BB#0: # %entry
+; SLM-NEXT: shldq $11, %rsi, %rdi # sched: [1:1.00]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: _Z8lshift11mm:
+; SANDY: # BB#0: # %entry
+; SANDY-NEXT: shldq $11, %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: _Z8lshift11mm:
+; HASWELL: # BB#0: # %entry
+; HASWELL-NEXT: shldq $11, %rsi, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: _Z8lshift11mm:
+; BROADWELL: # BB#0: # %entry
+; BROADWELL-NEXT: shldq $11, %rsi, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: _Z8lshift11mm:
+; SKYLAKE: # BB#0: # %entry
+; SKYLAKE-NEXT: shldq $11, %rsi, %rdi # sched: [3:1.00]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: _Z8lshift11mm:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: shldq $11, %rsi, %rdi # sched: [3:1.00]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: _Z8lshift11mm:
+; BTVER2: # BB#0: # %entry
+; BTVER2-NEXT: shldq $11, %rsi, %rdi # sched: [3:3.00]
+; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: _Z8lshift11mm:
+; ZNVER1: # BB#0: # %entry
+; ZNVER1-NEXT: shldq $11, %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+;
+; BDVER1-LABEL: _Z8lshift11mm:
+; BDVER1: # BB#0: # %entry
+; BDVER1-NEXT: shldq $11, %rsi, %rdi
+; BDVER1-NEXT: movq %rdi, %rax
+; BDVER1-NEXT: retq
+entry:
+  %shl = shl i64 %a, 11
+  %shr = lshr i64 %b, 53
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; clang -O2 -c test2.cpp -emit-llvm -S
+; Verify that we do not generate the shld instruction when we are not
+; optimizing for size, on X86_64 processors that are known to have poor
+; latency double precision shift instructions.
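+;
+; On BTVER2, for example, the expansion checked below (shlq + shrq + leaq,
+; each [1:0.50], with the two shifts independent of each other) is cheaper
+; than a single shldq at [3:3.00] under the new model, so avoiding the
+; double shift is the right call when we are not optimizing for size.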
+; uint64_t lshift12(uint64_t a, uint64_t b)
+; {
+; return (a << 12) | (b >> 52);
+; }
+
+; Function Attrs: nounwind readnone uwtable
+define i64 @_Z8lshift12mm(i64 %a, i64 %b) #2 {
+; GENERIC-LABEL: _Z8lshift12mm:
+; GENERIC: # BB#0: # %entry
+; GENERIC-NEXT: shldq $12, %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: _Z8lshift12mm:
+; ATOM: # BB#0: # %entry
+; ATOM-NEXT: shldq $12, %rsi, %rdi # sched: [9:4.50]
+; ATOM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: _Z8lshift12mm:
+; SLM: # BB#0: # %entry
+; SLM-NEXT: shldq $12, %rsi, %rdi # sched: [1:1.00]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: _Z8lshift12mm:
+; SANDY: # BB#0: # %entry
+; SANDY-NEXT: shldq $12, %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: _Z8lshift12mm:
+; HASWELL: # BB#0: # %entry
+; HASWELL-NEXT: shldq $12, %rsi, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: _Z8lshift12mm:
+; BROADWELL: # BB#0: # %entry
+; BROADWELL-NEXT: shldq $12, %rsi, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: _Z8lshift12mm:
+; SKYLAKE: # BB#0: # %entry
+; SKYLAKE-NEXT: shldq $12, %rsi, %rdi # sched: [3:1.00]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: _Z8lshift12mm:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: shldq $12, %rsi, %rdi # sched: [3:1.00]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: _Z8lshift12mm:
+; BTVER2: # BB#0: # %entry
+; BTVER2-NEXT: shlq $12, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: shrq $52, %rsi # sched: [1:0.50]
+; BTVER2-NEXT: leaq (%rsi,%rdi), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: _Z8lshift12mm:
+; ZNVER1: # BB#0: # %entry
+; ZNVER1-NEXT: shlq $12, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: shrq $52, %rsi # sched: [1:0.25]
+; ZNVER1-NEXT: leaq (%rsi,%rdi), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+;
+; BDVER1-LABEL: _Z8lshift12mm:
+; BDVER1: # BB#0: # %entry
+; BDVER1-NEXT: shlq $12, %rdi
+; BDVER1-NEXT: shrq $52, %rsi
+; BDVER1-NEXT: leaq (%rsi,%rdi), %rax
+; BDVER1-NEXT: retq
+entry:
+  %shl = shl i64 %a, 12
+  %shr = lshr i64 %b, 52
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; Verify that for the X86_64 processors that are known to have poor latency
+; double precision shift instructions we do not generate 'shld' or 'shrd'
+; instructions.
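+;
+; For the variable-count functions below, note how the expansion computes
+; the complementary count: BTVER2 and BDVER1 materialize 64-c into %cl with
+; an extra movl/subl pair, while ZNVER1 uses the BMI2 shlxq/shrxq forms and
+; avoids the %cl dependency entirely.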
+
+;uint64_t lshift(uint64_t a, uint64_t b, int c)
+;{
+; return (a << c) | (b >> (64-c));
+;}
+
+define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+; GENERIC-LABEL: lshift:
+; GENERIC: # BB#0: # %entry
+; GENERIC-NEXT: movl %edx, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: shldq %cl, %rsi, %rdi # sched: [4:1.50]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: lshift:
+; ATOM: # BB#0: # %entry
+; ATOM-NEXT: movl %edx, %ecx # sched: [1:0.50]
+; ATOM-NEXT: shldq %cl, %rsi, %rdi # sched: [8:4.00]
+; ATOM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: lshift:
+; SLM: # BB#0: # %entry
+; SLM-NEXT: movl %edx, %ecx # sched: [1:0.50]
+; SLM-NEXT: shldq %cl, %rsi, %rdi # sched: [1:1.00]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: lshift:
+; SANDY: # BB#0: # %entry
+; SANDY-NEXT: movl %edx, %ecx # sched: [1:0.33]
+; SANDY-NEXT: shldq %cl, %rsi, %rdi # sched: [4:1.50]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: lshift:
+; HASWELL: # BB#0: # %entry
+; HASWELL-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; HASWELL-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: lshift:
+; BROADWELL: # BB#0: # %entry
+; BROADWELL-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; BROADWELL-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: lshift:
+; SKYLAKE: # BB#0: # %entry
+; SKYLAKE-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; SKYLAKE-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: lshift:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; SKX-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: lshift:
+; BTVER2: # BB#0: # %entry
+; BTVER2-NEXT: movl %edx, %ecx # sched: [1:0.17]
+; BTVER2-NEXT: shlq %cl, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: movl $64, %ecx # sched: [1:0.17]
+; BTVER2-NEXT: subl %edx, %ecx # sched: [1:0.50]
+; BTVER2-NEXT: # kill: %CL %CL %ECX
+; BTVER2-NEXT: shrq %cl, %rsi # sched: [1:0.50]
+; BTVER2-NEXT: orq %rdi, %rsi # sched: [1:0.50]
+; BTVER2-NEXT: movq %rsi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: lshift:
+; ZNVER1: # BB#0: # %entry
+; ZNVER1-NEXT: movl $64, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: %EDX %EDX %RDX
+; ZNVER1-NEXT: shlxq %rdx, %rdi, %rcx # sched: [1:0.25]
+; ZNVER1-NEXT: subl %edx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: shrxq %rax, %rsi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+;
+; BDVER1-LABEL: lshift:
+; BDVER1: # BB#0: # %entry
+; BDVER1-NEXT: movl %edx, %ecx
+; BDVER1-NEXT: shlq %cl, %rdi
+; BDVER1-NEXT: movl $64, %ecx
+; BDVER1-NEXT: subl %edx, %ecx
+; BDVER1-NEXT: # kill: %CL %CL %ECX
+; BDVER1-NEXT: shrq %cl, %rsi
+; BDVER1-NEXT: orq %rdi, %rsi
+; BDVER1-NEXT: movq %rsi, %rax
+; BDVER1-NEXT: retq
+entry:
+  %sh_prom = zext i32 %c to i64
+  %shl = shl i64 %a, %sh_prom
+  %sub = sub nsw i32 64, %c
+  %sh_prom1 = zext i32 %sub to i64
+  %shr = lshr i64 %b, %sh_prom1
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+;uint64_t rshift(uint64_t a, uint64_t b, int c)
+;{
+; return (a >> c) | (b << (64-c));
+;}
+
+define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+; GENERIC-LABEL: rshift:
+; GENERIC: # BB#0: # %entry
+; GENERIC-NEXT: movl %edx, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: shrdq %cl, %rsi, %rdi # sched: [4:1.50]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: rshift:
+; ATOM: # BB#0: # %entry
+; ATOM-NEXT: movl %edx, %ecx # sched: [1:0.50]
+; ATOM-NEXT: shrdq %cl, %rsi, %rdi # sched: [8:4.00]
+; ATOM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: rshift:
+; SLM: # BB#0: # %entry
+; SLM-NEXT: movl %edx, %ecx # sched: [1:0.50]
+; SLM-NEXT: shrdq %cl, %rsi, %rdi # sched: [1:1.00]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: rshift:
+; SANDY: # BB#0: # %entry
+; SANDY-NEXT: movl %edx, %ecx # sched: [1:0.33]
+; SANDY-NEXT: shrdq %cl, %rsi, %rdi # sched: [4:1.50]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: rshift:
+; HASWELL: # BB#0: # %entry
+; HASWELL-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; HASWELL-NEXT: shrdq %cl, %rsi, %rdi # sched: [6:1.00]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: rshift:
+; BROADWELL: # BB#0: # %entry
+; BROADWELL-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; BROADWELL-NEXT: shrdq %cl, %rsi, %rdi # sched: [6:1.00]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: rshift:
+; SKYLAKE: # BB#0: # %entry
+; SKYLAKE-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; SKYLAKE-NEXT: shrdq %cl, %rsi, %rdi # sched: [6:1.00]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: rshift:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; SKX-NEXT: shrdq %cl, %rsi, %rdi # sched: [6:1.00]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: rshift:
+; BTVER2: # BB#0: # %entry
+; BTVER2-NEXT: movl %edx, %ecx # sched: [1:0.17]
+; BTVER2-NEXT: shrq %cl, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: movl $64, %ecx # sched: [1:0.17]
+; BTVER2-NEXT: subl %edx, %ecx # sched: [1:0.50]
+; BTVER2-NEXT: # kill: %CL %CL %ECX
+; BTVER2-NEXT: shlq %cl, %rsi # sched: [1:0.50]
+; BTVER2-NEXT: orq %rdi, %rsi # sched: [1:0.50]
+; BTVER2-NEXT: movq %rsi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: rshift:
+; ZNVER1: # BB#0: # %entry
+; ZNVER1-NEXT: movl $64, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: %EDX %EDX %RDX
+; ZNVER1-NEXT: shrxq %rdx, %rdi, %rcx # sched: [1:0.25]
+; ZNVER1-NEXT: subl %edx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: shlxq %rax, %rsi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+;
+; BDVER1-LABEL: rshift:
+; BDVER1: # BB#0: # %entry
+; BDVER1-NEXT: movl %edx, %ecx
+; BDVER1-NEXT: shrq %cl, %rdi
+; BDVER1-NEXT: movl $64, %ecx
+; BDVER1-NEXT: subl %edx, %ecx
+; BDVER1-NEXT: # kill: %CL %CL %ECX
+; BDVER1-NEXT: shlq %cl, %rsi
+; BDVER1-NEXT: orq %rdi, %rsi
+; BDVER1-NEXT: movq %rsi, %rax
+; BDVER1-NEXT: retq
+entry:
+  %sh_prom = zext i32 %c to i64
+  %shr = lshr i64 %a, %sh_prom
+  %sub = sub nsw i32 64, %c
+  %sh_prom1 = zext i32 %sub to i64
+  %shl = shl i64 %b, %sh_prom1
+  %or = or i64 %shl, %shr
+  ret i64 %or
+}
+
+; Verify that we still generate 'shld' and 'shrd' when optimizing for size,
+; even for the X86_64 processors that are known to have poor latency double
+; precision shift instructions.
+
+;uint64_t lshift(uint64_t a, uint64_t b, int c)
+;{
+; return (a << c) | (b >> (64-c));
+;}
+
+define i64 @lshift_optsize(i64 %a, i64 %b, i32 %c) nounwind readnone optsize {
+; GENERIC-LABEL: lshift_optsize:
+; GENERIC: # BB#0: # %entry
+; GENERIC-NEXT: movl %edx, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: shldq %cl, %rsi, %rdi # sched: [4:1.50]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: lshift_optsize:
+; ATOM: # BB#0: # %entry
+; ATOM-NEXT: movl %edx, %ecx # sched: [1:0.50]
+; ATOM-NEXT: shldq %cl, %rsi, %rdi # sched: [8:4.00]
+; ATOM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: lshift_optsize:
+; SLM: # BB#0: # %entry
+; SLM-NEXT: movl %edx, %ecx # sched: [1:0.50]
+; SLM-NEXT: shldq %cl, %rsi, %rdi # sched: [1:1.00]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: lshift_optsize:
+; SANDY: # BB#0: # %entry
+; SANDY-NEXT: movl %edx, %ecx # sched: [1:0.33]
+; SANDY-NEXT: shldq %cl, %rsi, %rdi # sched: [4:1.50]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: lshift_optsize:
+; HASWELL: # BB#0: # %entry
+; HASWELL-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; HASWELL-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: lshift_optsize:
+; BROADWELL: # BB#0: # %entry
+; BROADWELL-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; BROADWELL-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: lshift_optsize:
+; SKYLAKE: # BB#0: # %entry
+; SKYLAKE-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; SKYLAKE-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: lshift_optsize:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; SKX-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: lshift_optsize:
+; BTVER2: # BB#0: # %entry
+; BTVER2-NEXT: movl %edx, %ecx # sched: [1:0.17]
+; BTVER2-NEXT: shldq %cl, %rsi, %rdi # sched: [4:4.00]
+; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: lshift_optsize:
+; ZNVER1: # BB#0: # %entry
+; ZNVER1-NEXT: movl %edx, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT: shldq %cl, %rsi, %rdi # sched: [100:?]
+; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+;
+; BDVER1-LABEL: lshift_optsize:
+; BDVER1: # BB#0: # %entry
+; BDVER1-NEXT: movl %edx, %ecx
+; BDVER1-NEXT: shldq %cl, %rsi, %rdi
+; BDVER1-NEXT: movq %rdi, %rax
+; BDVER1-NEXT: retq
+entry:
+  %sh_prom = zext i32 %c to i64
+  %shl = shl i64 %a, %sh_prom
+  %sub = sub nsw i32 64, %c
+  %sh_prom1 = zext i32 %sub to i64
+  %shr = lshr i64 %b, %sh_prom1
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+@x = global i64 0, align 4
+
+; TODO: we should use the SHLD64mrCL instruction here, i.e.
+; shldq %cl, %rdi, {{.*}}(%rip)
+define void @lshift_optsize_mem(i64 %a, i32 %c) nounwind readnone optsize {
+; GENERIC-LABEL: lshift_optsize_mem:
+; GENERIC: # BB#0: # %entry
+; GENERIC-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; GENERIC-NEXT: movl %esi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: shldq %cl, %rax, %rdi # sched: [4:1.50]
+; GENERIC-NEXT: movq %rdi, {{.*}}(%rip) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: lshift_optsize_mem:
+; ATOM: # BB#0: # %entry
+; ATOM-NEXT: movq {{.*}}(%rip), %rax # sched: [1:1.00]
+; ATOM-NEXT: movl %esi, %ecx # sched: [1:0.50]
+; ATOM-NEXT: shldq %cl, %rax, %rdi # sched: [8:4.00]
+; ATOM-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: lshift_optsize_mem:
+; SLM: # BB#0: # %entry
+; SLM-NEXT: movq {{.*}}(%rip), %rax # sched: [3:1.00]
+; SLM-NEXT: movl %esi, %ecx # sched: [1:0.50]
+; SLM-NEXT: shldq %cl, %rax, %rdi # sched: [1:1.00]
+; SLM-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: lshift_optsize_mem:
+; SANDY: # BB#0: # %entry
+; SANDY-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; SANDY-NEXT: movl %esi, %ecx # sched: [1:0.33]
+; SANDY-NEXT: shldq %cl, %rax, %rdi # sched: [4:1.50]
+; SANDY-NEXT: movq %rdi, {{.*}}(%rip) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: lshift_optsize_mem:
+; HASWELL: # BB#0: # %entry
+; HASWELL-NEXT: movq {{.*}}(%rip), %rax # sched: [1:0.50]
+; HASWELL-NEXT: movl %esi, %ecx # sched: [1:0.25]
+; HASWELL-NEXT: shldq %cl, %rax, %rdi # sched: [6:1.00]
+; HASWELL-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: lshift_optsize_mem:
+; BROADWELL: # BB#0: # %entry
+; BROADWELL-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; BROADWELL-NEXT: movl %esi, %ecx # sched: [1:0.25]
+; BROADWELL-NEXT: shldq %cl, %rax, %rdi # sched: [6:1.00]
+; BROADWELL-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: lshift_optsize_mem:
+; SKYLAKE: # BB#0: # %entry
+; SKYLAKE-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl %esi, %ecx # sched: [1:0.25]
+; SKYLAKE-NEXT: shldq %cl, %rax, %rdi # sched: [6:1.00]
+; SKYLAKE-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: lshift_optsize_mem:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; SKX-NEXT: movl %esi, %ecx # sched: [1:0.25]
+; SKX-NEXT: shldq %cl, %rax, %rdi # sched: [6:1.00]
+; SKX-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: lshift_optsize_mem:
+; BTVER2: # BB#0: # %entry
+; BTVER2-NEXT: movq {{.*}}(%rip), %rax # sched: [5:1.00]
+; BTVER2-NEXT: movl %esi, %ecx # sched: [1:0.17]
+; BTVER2-NEXT: shldq %cl, %rax, %rdi # sched: [4:4.00]
+; BTVER2-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: lshift_optsize_mem:
+; ZNVER1: # BB#0: # %entry
+; ZNVER1-NEXT: movq {{.*}}(%rip), %rax # sched: [8:0.50]
+; ZNVER1-NEXT: movl %esi, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT: shldq %cl, %rax, %rdi # sched: [100:?]
+; ZNVER1-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+;
+; BDVER1-LABEL: lshift_optsize_mem:
+; BDVER1: # BB#0: # %entry
+; BDVER1-NEXT: movq {{.*}}(%rip), %rax
+; BDVER1-NEXT: movl %esi, %ecx
+; BDVER1-NEXT: shldq %cl, %rax, %rdi
+; BDVER1-NEXT: movq %rdi, {{.*}}(%rip)
+; BDVER1-NEXT: retq
+entry:
+  %sh_prom = zext i32 %c to i64
+  %shl = shl i64 %a, %sh_prom
+  %sub = sub nsw i32 64, %c
+  %sh_prom1 = zext i32 %sub to i64
+  %b = load i64, i64* @x
+  %shr = lshr i64 %b, %sh_prom1
+  %or = or i64 %shl, %shr
+  store i64 %or, i64* @x
+  ret void
+}
Index: test/CodeGen/X86/schedule-x86_64.ll
===================================================================
--- test/CodeGen/X86/schedule-x86_64.ll
+++ test/CodeGen/X86/schedule-x86_64.ll
@@ -1307,16 +1307,16 @@
 ; BTVER2-LABEL: test_shld_shrd_16:
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: #APP
-; BTVER2-NEXT: shldw %cl, %si, %di # sched: [1:0.50]
-; BTVER2-NEXT: shldw %cl, %si, (%rdx) # sched: [4:1.00]
-; BTVER2-NEXT: shldw $7, %si, %di # sched: [1:0.50]
-; BTVER2-NEXT: shldw $7, %si, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shldw %cl, %si, %di # sched: [4:4.00]
+; BTVER2-NEXT: shldw %cl, %si, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shldw $7, %si, %di # sched: [3:3.00]
+; BTVER2-NEXT: shldw $7, %si, (%rdx) # sched: [9:11.00]
 ; BTVER2-NEXT: #NO_APP
 ; BTVER2-NEXT: #APP
-; BTVER2-NEXT: shrdw %cl, %si, %di # sched: [1:0.50]
-; BTVER2-NEXT: shrdw %cl, %si, (%rdx) # sched: [4:1.00]
-; BTVER2-NEXT: shrdw $7, %si, %di # sched: [1:0.50]
-; BTVER2-NEXT: shrdw $7, %si, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrdw %cl, %si, %di # sched: [4:4.00]
+; BTVER2-NEXT: shrdw %cl, %si, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shrdw $7, %si, %di # sched: [3:3.00]
+; BTVER2-NEXT: shrdw $7, %si, (%rdx) # sched: [9:11.00]
 ; BTVER2-NEXT: #NO_APP
 ; BTVER2-NEXT: retq # sched: [4:1.00]
 ;
@@ -1471,16 +1471,16 @@
 ; BTVER2-LABEL: test_shld_shrd_32:
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: #APP
-; BTVER2-NEXT: shldl %cl, %esi, %edi # sched: [1:0.50]
-; BTVER2-NEXT: shldl %cl, %esi, (%rdx) # sched: [4:1.00]
-; BTVER2-NEXT: shldl $7, %esi, %edi # sched: [1:0.50]
-; BTVER2-NEXT: shldl $7, %esi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shldl %cl, %esi, %edi # sched: [4:4.00]
+; BTVER2-NEXT: shldl %cl, %esi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shldl $7, %esi, %edi # sched: [3:3.00]
+; BTVER2-NEXT: shldl $7, %esi, (%rdx) # sched: [9:11.00]
 ; BTVER2-NEXT: #NO_APP
 ; BTVER2-NEXT: #APP
-; BTVER2-NEXT: shrdl %cl, %esi, %edi # sched: [1:0.50]
-; BTVER2-NEXT: shrdl %cl, %esi, (%rdx) # sched: [4:1.00]
-; BTVER2-NEXT: shrdl $7, %esi, %edi # sched: [1:0.50]
-; BTVER2-NEXT: shrdl $7, %esi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrdl %cl, %esi, %edi # sched: [4:4.00]
+; BTVER2-NEXT: shrdl %cl, %esi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shrdl $7, %esi, %edi # sched: [3:3.00]
+; BTVER2-NEXT: shrdl $7, %esi, (%rdx) # sched: [9:11.00]
 ; BTVER2-NEXT: #NO_APP
 ; BTVER2-NEXT: retq # sched: [4:1.00]
 ;
@@ -1635,16 +1635,16 @@
 ; BTVER2-LABEL: test_shld_shrd_64:
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: #APP
-; BTVER2-NEXT: shldq %cl, %rsi, %rdi # sched: [1:0.50]
-; BTVER2-NEXT: shldq %cl, %rsi, (%rdx) # sched: [4:1.00]
-; BTVER2-NEXT: shldq $7, %rsi, %rdi # sched: [1:0.50]
-; BTVER2-NEXT: shldq $7, %rsi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shldq %cl, %rsi, %rdi # sched: [4:4.00]
+; BTVER2-NEXT: shldq %cl, %rsi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shldq $7, %rsi, %rdi # sched: [3:3.00]
+; BTVER2-NEXT: shldq $7, %rsi, (%rdx) # sched: [9:11.00]
 ; BTVER2-NEXT: #NO_APP
 ; BTVER2-NEXT: #APP
-; BTVER2-NEXT: shrdq %cl, %rsi, %rdi # sched: [1:0.50]
-; BTVER2-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [4:1.00]
-; BTVER2-NEXT: shrdq $7, %rsi, %rdi # sched: [1:0.50]
-; BTVER2-NEXT: shrdq $7, %rsi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrdq %cl, %rsi, %rdi # sched: [4:4.00]
+; BTVER2-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shrdq $7, %rsi, %rdi # sched: [3:3.00]
+; BTVER2-NEXT: shrdq $7, %rsi, (%rdx) # sched: [9:11.00]
 ; BTVER2-NEXT: #NO_APP
 ; BTVER2-NEXT: retq # sched: [4:1.00]
 ;