Index: lib/Target/X86/X86ScheduleBtVer2.td
===================================================================
--- lib/Target/X86/X86ScheduleBtVer2.td
+++ lib/Target/X86/X86ScheduleBtVer2.td
@@ -371,6 +371,41 @@
 def : WriteRes;
 
 ////////////////////////////////////////////////////////////////////////////////
+// SSE4.1 instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+// DPPS/DPPD use both JFPU0 and JFPU1 for 3 cycles each. The folded-load forms
+// add 1 cycle on JLAGU and 5 cycles of latency, and list ReadAfterLd in the
+// same way as the VDPPSYrm entry in the AVX section.
+def WriteDPPS: SchedWriteRes<[JFPU0, JFPU1]> {
+  let Latency = 11;
+  let ResourceCycles = [3,3];
+  let NumMicroOps = 5;
+}
+def : InstRW<[WriteDPPS], (instregex "(V)?DPPSrri")>;
+
+def WriteDPPSLd: SchedWriteRes<[JLAGU, JFPU0, JFPU1]> {
+  let Latency = 16;
+  let ResourceCycles = [1,3,3];
+  let NumMicroOps = 6;
+}
+def : InstRW<[WriteDPPSLd, ReadAfterLd], (instregex "(V)?DPPSrmi")>;
+
+def WriteDPPD: SchedWriteRes<[JFPU0, JFPU1]> {
+  let Latency = 9;
+  let ResourceCycles = [3,3];
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteDPPD], (instregex "(V)?DPPDrri")>;
+
+def WriteDPPDLd: SchedWriteRes<[JLAGU, JFPU0, JFPU1]> {
+  let Latency = 14;
+  let ResourceCycles = [1,3,3];
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteDPPDLd, ReadAfterLd], (instregex "(V)?DPPDrmi")>;
+
+////////////////////////////////////////////////////////////////////////////////
 // SSE4A instructions.
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -390,6 +425,22 @@
 // AVX instructions.
 ////////////////////////////////////////////////////////////////////////////////
 
+// 256-bit VDPPS: 6 cycles on each of JFPU1 and JFPU0. The folded-load form
+// adds 1 cycle on JLAGU, 5 cycles of latency and one micro-op.
+def WriteVDPPSY: SchedWriteRes<[JFPU1, JFPU0]> {
+  let Latency = 12;
+  let ResourceCycles = [6, 6];
+  let NumMicroOps = 10;
+}
+def : InstRW<[WriteVDPPSY], (instregex "VDPPSYrr")>;
+
+def WriteVDPPSYLd: SchedWriteRes<[JLAGU, JFPU1, JFPU0]> {
+  let Latency = 17;
+  let ResourceCycles = [1, 6, 6];
+  let NumMicroOps = 11;
+}
+def : InstRW<[WriteVDPPSYLd, ReadAfterLd], (instregex "VDPPSYrm")>;
+
 def WriteFAddY: SchedWriteRes<[JFPU0]> {
   let Latency = 3;
   let ResourceCycles = [2];
Index: test/CodeGen/X86/avx-schedule.ll
===================================================================
--- test/CodeGen/X86/avx-schedule.ll
+++ test/CodeGen/X86/avx-schedule.ll
@@ -1509,8 +1509,8 @@
 ;
 ; BTVER2-LABEL: test_dpps:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:6.00]
+; BTVER2-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [17:6.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_dpps:
Index: test/CodeGen/X86/sse41-schedule.ll
===================================================================
--- test/CodeGen/X86/sse41-schedule.ll
+++ test/CodeGen/X86/sse41-schedule.ll
@@ -320,8 +320,8 @@
 ;
 ; BTVER2-LABEL: test_dppd:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:3.00]
+; BTVER2-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [14:3.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_dppd:
@@ -381,8 +381,8 @@
 ;
 ; BTVER2-LABEL: test_dpps:
 ; BTVER2:       # BB#0:
-; BTVER2-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [11:3.00]
+; BTVER2-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [16:3.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_dpps: