Index: lib/Target/X86/X86ScheduleBdVer2.td =================================================================== --- lib/Target/X86/X86ScheduleBdVer2.td +++ lib/Target/X86/X86ScheduleBdVer2.td @@ -250,7 +250,10 @@ def : ReadAdvance<ReadAfterVecXLd, 5>; def : ReadAdvance<ReadAfterVecYLd, 5>; -def : ReadAdvance<ReadInt2Fpu, 0>; +// Transfer from int domain to ivec domain incurs additional latency of 8..10cy +// Reference: Agner, Microarchitecture, "AMD Bulldozer, Piledriver, Steamroller +// and Excavator pipeline", "Data delay between different execution domains" +def : ReadAdvance<ReadInt2Fpu, -10>; // A folded store needs a cycle on the PdStore for the store data. def : WriteRes<WriteRMW, [PdStore]>; Index: test/CodeGen/X86/mmx-schedule.ll =================================================================== --- test/CodeGen/X86/mmx-schedule.ll +++ test/CodeGen/X86/mmx-schedule.ll @@ -3880,8 +3880,8 @@ ; BDVER2-LABEL: test_pinsrw: ; BDVER2: # %bb.0: ; BDVER2-NEXT: movswl (%rsi), %eax # sched: [5:0.50] -; BDVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [2:0.50] -; BDVER2-NEXT: pinsrw $1, %eax, %mm0 # sched: [2:0.50] +; BDVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [12:0.50] +; BDVER2-NEXT: pinsrw $1, %eax, %mm0 # sched: [12:0.50] ; BDVER2-NEXT: movq %mm0, %rax # sched: [10:1.00] ; BDVER2-NEXT: retq # sched: [5:1.00] ; Index: test/CodeGen/X86/sse2-schedule.ll =================================================================== --- test/CodeGen/X86/sse2-schedule.ll +++ test/CodeGen/X86/sse2-schedule.ll @@ -10100,13 +10100,13 @@ ; ; BDVER2-SSE-LABEL: test_pinsrw: ; BDVER2-SSE: # %bb.0: -; BDVER2-SSE-NEXT: pinsrw $1, %edi, %xmm0 # sched: [2:0.50] +; BDVER2-SSE-NEXT: pinsrw $1, %edi, %xmm0 # sched: [12:0.50] ; BDVER2-SSE-NEXT: pinsrw $3, (%rsi), %xmm0 # sched: [6:0.50] ; BDVER2-SSE-NEXT: retq # sched: [5:1.00] ; ; BDVER2-LABEL: test_pinsrw: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:0.50] +; BDVER2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [12:0.50] ; BDVER2-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:0.50] ; BDVER2-NEXT: retq # 
sched: [5:1.00] ; Index: test/CodeGen/X86/sse41-schedule.ll =================================================================== --- test/CodeGen/X86/sse41-schedule.ll +++ test/CodeGen/X86/sse41-schedule.ll @@ -2425,13 +2425,13 @@ ; ; BDVER2-SSE-LABEL: test_pinsrb: ; BDVER2-SSE: # %bb.0: -; BDVER2-SSE-NEXT: pinsrb $1, %edi, %xmm0 # sched: [2:0.50] +; BDVER2-SSE-NEXT: pinsrb $1, %edi, %xmm0 # sched: [12:0.50] ; BDVER2-SSE-NEXT: pinsrb $3, (%rsi), %xmm0 # sched: [6:0.50] ; BDVER2-SSE-NEXT: retq # sched: [5:1.00] ; ; BDVER2-LABEL: test_pinsrb: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:0.50] +; BDVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [12:0.50] ; BDVER2-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:0.50] ; BDVER2-NEXT: retq # sched: [5:1.00] ; @@ -2539,13 +2539,13 @@ ; ; BDVER2-SSE-LABEL: test_pinsrd: ; BDVER2-SSE: # %bb.0: -; BDVER2-SSE-NEXT: pinsrd $1, %edi, %xmm0 # sched: [2:0.50] +; BDVER2-SSE-NEXT: pinsrd $1, %edi, %xmm0 # sched: [12:0.50] ; BDVER2-SSE-NEXT: pinsrd $3, (%rsi), %xmm0 # sched: [6:0.50] ; BDVER2-SSE-NEXT: retq # sched: [5:1.00] ; ; BDVER2-LABEL: test_pinsrd: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:0.50] +; BDVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [12:0.50] ; BDVER2-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:0.50] ; BDVER2-NEXT: retq # sched: [5:1.00] ; @@ -2666,14 +2666,14 @@ ; BDVER2-SSE-LABEL: test_pinsrq: ; BDVER2-SSE: # %bb.0: ; BDVER2-SSE-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [6:0.50] -; BDVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [2:0.50] +; BDVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [12:0.50] ; BDVER2-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [2:0.50] ; BDVER2-SSE-NEXT: retq # sched: [5:1.00] ; ; BDVER2-LABEL: test_pinsrq: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:0.50] -; BDVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:0.50] +; BDVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: 
[12:0.50] ; BDVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50] ; BDVER2-NEXT: retq # sched: [5:1.00] ; Index: test/tools/llvm-mca/X86/BdVer2/int-to-fpu-forwarding-1.s =================================================================== --- test/tools/llvm-mca/X86/BdVer2/int-to-fpu-forwarding-1.s +++ test/tools/llvm-mca/X86/BdVer2/int-to-fpu-forwarding-1.s @@ -42,8 +42,8 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 2 0.50 vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: 2 2 0.50 vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 2 12 0.50 vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 2 12 0.50 vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK: Resources: # CHECK-NEXT: [0.0] - PdAGLU01 @@ -100,8 +100,8 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 2 0.50 vpinsrw $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: 2 2 0.50 vpinsrw $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 2 12 0.50 vpinsrw $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 2 12 0.50 vpinsrw $1, %eax, %xmm0, %xmm0 # CHECK: Resources: # CHECK-NEXT: [0.0] - PdAGLU01 @@ -158,8 +158,8 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 2 0.50 vpinsrd $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: 2 2 0.50 vpinsrd $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 2 12 0.50 vpinsrd $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 2 12 0.50 vpinsrd $1, %eax, %xmm0, %xmm0 # CHECK: Resources: # CHECK-NEXT: [0.0] - PdAGLU01 @@ -216,8 +216,8 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 2 0.50 vpinsrq $0, %rax, %xmm0, %xmm0 -# CHECK-NEXT: 2 2 0.50 vpinsrq $1, %rax, %xmm0, %xmm0 +# CHECK-NEXT: 2 12 0.50 vpinsrq $0, %rax, %xmm0, %xmm0 +# CHECK-NEXT: 2 12 0.50 vpinsrq $1, %rax, %xmm0, %xmm0 # CHECK: Resources: # CHECK-NEXT: [0.0] - PdAGLU01 Index: test/tools/llvm-mca/X86/BdVer2/int-to-fpu-forwarding-3.s 
=================================================================== --- test/tools/llvm-mca/X86/BdVer2/int-to-fpu-forwarding-3.s +++ test/tools/llvm-mca/X86/BdVer2/int-to-fpu-forwarding-3.s @@ -7,12 +7,12 @@ # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1500 -# CHECK-NEXT: Total Cycles: 2004 +# CHECK-NEXT: Total Cycles: 2014 # CHECK-NEXT: Total uOps: 2500 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 1.25 -# CHECK-NEXT: IPC: 0.75 +# CHECK-NEXT: uOps Per Cycle: 1.24 +# CHECK-NEXT: IPC: 0.74 # CHECK-NEXT: Block RThroughput: 1.3 # CHECK: Instruction Info: @@ -25,8 +25,8 @@ # CHECK: [1] [2] [3] [4] [5] [6] Instructions: # CHECK-NEXT: 1 1 0.50 addl %eax, %eax -# CHECK-NEXT: 2 2 0.50 vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: 2 2 0.50 vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 2 12 0.50 vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 2 12 0.50 vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK: Resources: # CHECK-NEXT: [0.0] - PdAGLU01 @@ -64,18 +64,18 @@ # CHECK-NEXT: - - - - - - - - - - 1.00 - - - 1.00 - - - - - - - - vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 012345 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345 -# CHECK: [0,0] DeER . . . addl %eax, %eax -# CHECK-NEXT: [0,1] D=eeER . . vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [0,2] .D==eeER . . vpinsrb $1, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [1,0] .DeE---R . . addl %eax, %eax -# CHECK-NEXT: [1,1] . D===eeER. . vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [1,2] . D=====eeER . vpinsrb $1, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [2,0] . DeE-----R . addl %eax, %eax -# CHECK-NEXT: [2,1] . D======eeER . vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: [2,2] . D=======eeER vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK: [0,0] DeER . . . . . addl %eax, %eax +# CHECK-NEXT: [0,1] D===========eeER . . vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [0,2] .D============eeER . . vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [1,0] .DeE-------------R . . 
addl %eax, %eax +# CHECK-NEXT: [1,1] . D=============eeER. . vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [1,2] . D===============eeER . vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [2,0] . DeE---------------R . addl %eax, %eax +# CHECK-NEXT: [2,1] . D================eeER . vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: [2,2] . D=================eeER vpinsrb $1, %eax, %xmm0, %xmm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -84,6 +84,6 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 1.0 0.7 2.7 addl %eax, %eax -# CHECK-NEXT: 1. 3 4.3 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0 -# CHECK-NEXT: 2. 3 5.7 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 0. 3 1.0 0.7 9.3 addl %eax, %eax +# CHECK-NEXT: 1. 3 14.3 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0 +# CHECK-NEXT: 2. 3 15.7 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0 Index: test/tools/llvm-mca/X86/BdVer2/resources-avx1.s =================================================================== --- test/tools/llvm-mca/X86/BdVer2/resources-avx1.s +++ test/tools/llvm-mca/X86/BdVer2/resources-avx1.s @@ -1469,13 +1469,13 @@ # CHECK-NEXT: 4 10 0.50 * vphsubsw (%rax), %xmm1, %xmm2 # CHECK-NEXT: 3 5 0.50 vphsubw %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 4 10 0.50 * vphsubw (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 2 2 0.50 vpinsrb $1, %eax, %xmm1, %xmm2 +# CHECK-NEXT: 2 12 0.50 vpinsrb $1, %eax, %xmm1, %xmm2 # CHECK-NEXT: 2 6 0.50 * vpinsrb $1, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 2 2 0.50 vpinsrd $1, %eax, %xmm1, %xmm2 +# CHECK-NEXT: 2 12 0.50 vpinsrd $1, %eax, %xmm1, %xmm2 # CHECK-NEXT: 2 6 0.50 * vpinsrd $1, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 2 2 0.50 vpinsrq $1, %rax, %xmm1, %xmm2 +# CHECK-NEXT: 2 12 0.50 vpinsrq $1, %rax, %xmm1, %xmm2 # CHECK-NEXT: 2 6 0.50 * vpinsrq $1, (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 2 2 0.50 vpinsrw $1, %eax, %xmm1, %xmm2 +# CHECK-NEXT: 2 12 0.50 vpinsrw $1, %eax, %xmm1, %xmm2 # CHECK-NEXT: 2 6 0.50 * vpinsrw 
$1, (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 4 1.00 vpmaddubsw %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 9 1.00 * vpmaddubsw (%rax), %xmm1, %xmm2 Index: test/tools/llvm-mca/X86/BdVer2/resources-sse1.s =================================================================== --- test/tools/llvm-mca/X86/BdVer2/resources-sse1.s +++ test/tools/llvm-mca/X86/BdVer2/resources-sse1.s @@ -269,7 +269,7 @@ # CHECK-NEXT: 1 2 0.50 pavgw %mm0, %mm2 # CHECK-NEXT: 1 7 0.50 * pavgw (%rax), %mm2 # CHECK-NEXT: 2 13 1.00 pextrw $1, %mm0, %ecx -# CHECK-NEXT: 2 2 0.50 pinsrw $1, %eax, %mm2 +# CHECK-NEXT: 2 12 0.50 pinsrw $1, %eax, %mm2 # CHECK-NEXT: 2 6 0.50 * pinsrw $1, (%rax), %mm2 # CHECK-NEXT: 1 2 0.50 pmaxsw %mm0, %mm2 # CHECK-NEXT: 1 7 0.50 * pmaxsw (%rax), %mm2 Index: test/tools/llvm-mca/X86/BdVer2/resources-sse2.s =================================================================== --- test/tools/llvm-mca/X86/BdVer2/resources-sse2.s +++ test/tools/llvm-mca/X86/BdVer2/resources-sse2.s @@ -561,7 +561,7 @@ # CHECK-NEXT: 1 2 0.50 pcmpgtw %xmm0, %xmm2 # CHECK-NEXT: 1 7 0.50 * pcmpgtw (%rax), %xmm2 # CHECK-NEXT: 2 13 1.00 pextrw $1, %xmm0, %ecx -# CHECK-NEXT: 2 2 0.50 pinsrw $1, %eax, %xmm0 +# CHECK-NEXT: 2 12 0.50 pinsrw $1, %eax, %xmm0 # CHECK-NEXT: 2 6 0.50 * pinsrw $1, (%rax), %xmm0 # CHECK-NEXT: 1 4 1.00 pmaddwd %xmm0, %xmm2 # CHECK-NEXT: 1 9 1.00 * pmaddwd (%rax), %xmm2 Index: test/tools/llvm-mca/X86/BdVer2/resources-sse41.s =================================================================== --- test/tools/llvm-mca/X86/BdVer2/resources-sse41.s +++ test/tools/llvm-mca/X86/BdVer2/resources-sse41.s @@ -191,11 +191,11 @@ # CHECK-NEXT: 2 13 1.00 * pextrw $1, %xmm0, (%rax) # CHECK-NEXT: 2 4 1.00 phminposuw %xmm0, %xmm2 # CHECK-NEXT: 2 9 1.00 * phminposuw (%rax), %xmm2 -# CHECK-NEXT: 2 2 0.50 pinsrb $1, %eax, %xmm1 +# CHECK-NEXT: 2 12 0.50 pinsrb $1, %eax, %xmm1 # CHECK-NEXT: 2 6 0.50 * pinsrb $1, (%rax), %xmm1 -# CHECK-NEXT: 2 2 0.50 pinsrd $1, %eax, %xmm1 +# CHECK-NEXT: 2 12 0.50 pinsrd $1, %eax, 
%xmm1 # CHECK-NEXT: 2 6 0.50 * pinsrd $1, (%rax), %xmm1 -# CHECK-NEXT: 2 2 0.50 pinsrq $1, %rax, %xmm1 +# CHECK-NEXT: 2 12 0.50 pinsrq $1, %rax, %xmm1 # CHECK-NEXT: 2 6 0.50 * pinsrq $1, (%rax), %xmm1 # CHECK-NEXT: 1 2 0.50 pmaxsb %xmm0, %xmm2 # CHECK-NEXT: 1 7 0.50 * pmaxsb (%rax), %xmm2