Index: lib/Target/X86/X86ScheduleBtVer2.td
===================================================================
--- lib/Target/X86/X86ScheduleBtVer2.td
+++ lib/Target/X86/X86ScheduleBtVer2.td
@@ -65,6 +65,8 @@
 def JFPM : ProcResource<1>; // FP multiplication
 def JFPA : ProcResource<1>; // FP addition
 
+def JVALU01 : ProcResGroup<[JVALU0, JVALU1]>;
+
 // Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
 // cycles after the memory operand.
 def : ReadAdvance<ReadAfterLd, 3>;
@@ -320,6 +322,40 @@
 }
 
 ////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def WritePHorizontal: SchedWriteRes<[JVALU01]> {
+  let Latency = 1;
+  let ResourceCycles = [1];
+}
+def : InstRW<[WritePHorizontal], (instregex "(V)?PH(ADD|SUB)(D|W|SW)rr")>;
+
+def WritePHorizontalLd: SchedWriteRes<[JVALU01]> {
+  let Latency = 6;
+  let ResourceCycles = [1];
+}
+def : InstRW<[WritePHorizontalLd], (instregex "(V)?PH(ADD|SUB)(D|W|SW)rm")>;
+
+def WriteHorizontal: SchedWriteRes<[JFPA]> {
+  let Latency = 3;
+  let ResourceCycles = [1];
+}
+def : InstRW<[WriteHorizontal], (instregex "(V)?H(ADD|SUB)P(S|D)rr")>;
+
+def WriteHorizontalY: SchedWriteRes<[JFPA]> {
+  let Latency = 3;
+  let ResourceCycles = [2];
+}
+def : InstRW<[WriteHorizontalY], (instregex "VH(ADD|SUB)P(S|D)Yrr")>;
+
+def WriteHorizontalYLd: SchedWriteRes<[JFPA]> {
+  let Latency = 8;
+  let ResourceCycles = [2];
+}
+def : InstRW<[WriteHorizontalYLd], (instregex "VH(ADD|SUB)P(S|D)Yrm")>;
+
+////////////////////////////////////////////////////////////////////////////////
 // Carry-less multiplication instructions.
 ////////////////////////////////////////////////////////////////////////////////
 
Index: test/CodeGen/X86/avx-schedule.ll
===================================================================
--- test/CodeGen/X86/avx-schedule.ll
+++ test/CodeGen/X86/avx-schedule.ll
@@ -910,14 +910,14 @@
 ;
 ; BTVER2-LABEL: test_haddpd:
 ; BTVER2: # BB#0:
-; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_haddpd:
 ; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
 ; ZNVER1-NEXT: retq # sched: [4:1.00]
   %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
   %2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -941,14 +941,14 @@
 ;
 ; BTVER2-LABEL: test_haddps:
 ; BTVER2: # BB#0:
-; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_haddps:
 ; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
 ; ZNVER1-NEXT: retq # sched: [4:1.00]
   %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
   %2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -972,14 +972,14 @@
 ;
 ; BTVER2-LABEL: test_hsubpd:
 ; BTVER2: # BB#0:
-; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_hsubpd:
 ; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
 ; ZNVER1-NEXT: retq # sched: [4:1.00]
   %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
   %2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -1003,14 +1003,14 @@
 ;
 ; BTVER2-LABEL: test_hsubps:
 ; BTVER2: # BB#0:
-; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_hsubps:
 ; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
 ; ZNVER1-NEXT: retq # sched: [4:1.00]
   %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
   %2 = load <8 x float>, <8 x float> *%a2, align 32
Index: test/CodeGen/X86/ssse3-schedule.ll
===================================================================
--- test/CodeGen/X86/ssse3-schedule.ll
+++ test/CodeGen/X86/ssse3-schedule.ll
@@ -236,7 +236,7 @@
 ; BTVER2-LABEL: test_phaddd:
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
   %1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
   %2 = load <4 x i32>, <4 x i32> *%a2, align 16
@@ -287,7 +287,7 @@
 ; BTVER2-LABEL: test_phaddsw:
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
   %1 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
   %2 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -330,7 +330,7 @@
 ; BTVER2-LABEL: test_phaddw:
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
   %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
   %2 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -373,7 +373,7 @@
 ; BTVER2-LABEL: test_phsubd:
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
   %1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
   %2 = load <4 x i32>, <4 x i32> *%a2, align 16
@@ -424,7 +424,7 @@
 ; BTVER2-LABEL: test_phsubsw:
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
   %1 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
   %2 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -467,7 +467,7 @@
 ; BTVER2-LABEL: test_phsubw:
 ; BTVER2: # BB#0:
 ; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
 ; BTVER2-NEXT: retq # sched: [4:1.00]
   %1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
   %2 = load <8 x i16>, <8 x i16> *%a2, align 16