Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -5183,14 +5183,15 @@ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, - Sched<[WriteFAdd]>; + Sched<[WriteHAdd]>; def rm : S3DI, Sched<[WriteFAddLd, ReadAfterLd]>; + IIC_SSE_HADDSUB_RM>, + Sched<[WriteHAddLd, ReadAfterLd]>; } multiclass S3_Int o, string OpcodeStr, ValueType vt, RegisterClass RC, X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, @@ -5200,14 +5201,40 @@ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, - Sched<[WriteFAdd]>; + Sched<[WriteHAdd]>; def rm : S3I, Sched<[WriteFAddLd, ReadAfterLd]>; + IIC_SSE_HADDSUB_RM>, Sched<[WriteHAddLd, ReadAfterLd]>; +} + +multiclass S3D_IntY o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag> { + def rr : S3DI, + Sched<[WriteHAddY]>; + + def rm : S3DI, + Sched<[WriteHAddYLd, ReadAfterLd]>; +} +multiclass S3_IntY o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag> { + def rr : S3I, + Sched<[WriteHAddY]>; + + def rm : S3I, Sched<[WriteHAddYLd, ReadAfterLd]>; } let Predicates = [HasAVX] in { @@ -5216,20 +5243,20 @@ X86fhadd, loadv4f32, 0>, VEX_4V, VEX_WIG; defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, X86fhsub, loadv4f32, 0>, VEX_4V, VEX_WIG; - defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, - X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; - defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, - X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; + defm VHADDPSY : S3D_IntY<0x7C, "vhaddps", v8f32, VR256, f256mem, + 
X86fhadd, loadv8f32>, VEX_4V, VEX_L, VEX_WIG; + defm VHSUBPSY : S3D_IntY<0x7D, "vhsubps", v8f32, VR256, f256mem, + X86fhsub, loadv8f32>, VEX_4V, VEX_L, VEX_WIG; } let ExeDomain = SSEPackedDouble in { defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, X86fhadd, loadv2f64, 0>, VEX_4V, VEX_WIG; defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, X86fhsub, loadv2f64, 0>, VEX_4V, VEX_WIG; - defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, - X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; - defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, - X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; + defm VHADDPDY : S3_IntY <0x7C, "vhaddpd", v4f64, VR256, f256mem, + X86fhadd, loadv4f64>, VEX_4V, VEX_L, VEX_WIG; + defm VHSUBPDY : S3_IntY <0x7D, "vhsubpd", v4f64, VR256, f256mem, + X86fhsub, loadv4f64>, VEX_4V, VEX_L, VEX_WIG; } } @@ -5310,7 +5337,7 @@ // SSSE3 - Packed Binary Operator Instructions //===---------------------------------------------------------------------===// -let Sched = WriteVecALU in { +let Sched = WritePHAdd in { def SSE_PHADDSUBD : OpndItins< IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM >; Index: lib/Target/X86/X86SchedHaswell.td =================================================================== --- lib/Target/X86/X86SchedHaswell.td +++ lib/Target/X86/X86SchedHaswell.td @@ -1486,16 +1486,48 @@ } def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>; -//-- Arithmetic instructions --// +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +// HADD, HSUB PS/PD +// x,x / v,v,v. +def : WriteRes { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} + +def : WriteRes; + +// x,m / v,v,m. +def : WriteRes { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} + +def : WriteRes; // PHADD|PHSUB (S) W/D. 
// v <- v,v. -def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> { +def : WriteRes { let Latency = 3; let NumMicroOps = 3; let ResourceCycles = [1, 2]; } -def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64", +// v <- v,m. +def : WriteRes { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 2, 1]; +} + +//-- Arithmetic instructions --// + +// PHADD|PHSUB (S) W/D. +// v <- v,v. +def : InstRW<[WritePHAdd], (instregex "MMX_PHADD(W?)rr64", "MMX_PHADDSWrr64", "MMX_PHSUB(W|D)rr64", "MMX_PHSUBSWrr64", @@ -1503,12 +1535,7 @@ "(V?)PH(ADD|SUB)SWrr(256)?")>; // v <- v,m. -def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { - let Latency = 6; - let NumMicroOps = 3; - let ResourceCycles = [1, 2, 1]; -} -def : InstRW<[WritePHADDSUBm, ReadAfterLd], +def : InstRW<[WritePHAddLd, ReadAfterLd], (instregex "MMX_PHADD(W?)rm64", "MMX_PHADDSWrm64", "MMX_PHSUB(W|D)rm64", @@ -1871,20 +1898,9 @@ // HADD, HSUB PS/PD // x,x / v,v,v. -def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> { - let Latency = 5; - let NumMicroOps = 3; - let ResourceCycles = [1, 2]; -} -def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>; - +def : InstRW<[WriteHAdd], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>; // x,m / v,v,m. -def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1, 2, 1]; -} -def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>; +def : InstRW<[WriteHAddLd], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>; // MULL SS/SD PS/PD. // x,x / v,v,v. Index: lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- lib/Target/X86/X86SchedSandyBridge.td +++ lib/Target/X86/X86SchedSandyBridge.td @@ -157,6 +157,47 @@ let ResourceCycles = [1, 1, 1, 1]; } +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. 
+//////////////////////////////////////////////////////////////////////////////// +// TODO: at the moment we don't model horizontal operations here, which is why +// this is just a simple placeholder for a future implementation. +// HADD, HSUB PS/PD +// x,x / v,v,v. +def : WriteRes { + let Latency = 3; + let ResourceCycles = [1]; +} + +def : WriteRes { + let Latency = 3; + let ResourceCycles = [1]; +} + +// x,m / v,v,m. +def : WriteRes { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +def : WriteRes { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +// PHADD|PHSUB (S) W/D. +// v <- v,v. +def : WriteRes { + let Latency = 1; + let ResourceCycles = [1]; +} + +// v <- v,m. +def : WriteRes { + let Latency = 5; + let ResourceCycles = [1, 1]; +} + // String instructions. // Packed Compare Implicit Length Strings, Return Mask def : WriteRes { Index: lib/Target/X86/X86Schedule.td =================================================================== --- lib/Target/X86/X86Schedule.td +++ lib/Target/X86/X86Schedule.td @@ -77,6 +77,11 @@ // FMA Scheduling helper class. class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } +// Horizontal Add/Sub (scalar and vector) +defm WriteHAdd : X86SchedWritePair; +defm WriteHAddY : X86SchedWritePair; +defm WritePHAdd : X86SchedWritePair; + // Vector integer operations. defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. defm WriteVecShift : X86SchedWritePair; // Vector integer shifts. Index: lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- lib/Target/X86/X86ScheduleBtVer2.td +++ lib/Target/X86/X86ScheduleBtVer2.td @@ -65,6 +65,8 @@ def JFPM : ProcResource<1>; // FP multiplication def JFPA : ProcResource<1>; // FP addition +def JVALU01 : ProcResGroup<[JVALU0, JVALU1]>; + // Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3 // cycles after the memory operand. 
def : ReadAdvance; @@ -320,6 +322,39 @@ } //////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes { + let Latency = 3; + let ResourceCycles = [1]; +} + +def : WriteRes { + let Latency = 8; + let ResourceCycles = [1, 1]; +} + +def : WriteRes { + let Latency = 1; + let ResourceCycles = [1]; +} +def : WriteRes { + let Latency = 6; + let ResourceCycles = [1, 1]; +} + +def : WriteRes { + let Latency = 3; + let ResourceCycles = [2]; +} + +def : WriteRes { + let Latency = 8; + let ResourceCycles = [1, 2]; +} + +//////////////////////////////////////////////////////////////////////////////// // Carry-less multiplication instructions. //////////////////////////////////////////////////////////////////////////////// Index: lib/Target/X86/X86ScheduleSLM.td =================================================================== --- lib/Target/X86/X86ScheduleSLM.td +++ lib/Target/X86/X86ScheduleSLM.td @@ -137,6 +137,45 @@ defm : SMWriteResPair; defm : SMWriteResPair; +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// +// TODO: at the moment we don't model horizontal operations here, which is why +// this is just a simple placeholder for a future implementation. + +// HADD, HSUB PS/PD + +def : WriteRes { + let Latency = 3; + let ResourceCycles = [2]; +} + +def : WriteRes { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +def : WriteRes { + let Latency = 1; + let ResourceCycles = [1]; +} + +def : WriteRes { + let Latency = 6; + let ResourceCycles = [1, 1]; +} + +// PHADD|PHSUB (S) W/D. +def : WriteRes { + let Latency = 1; + let ResourceCycles = [1]; +} + +def : WriteRes { + let Latency = 4; + let ResourceCycles = [1, 1]; +} + // String instructions. 
// Packed Compare Implicit Length Strings, Return Mask def : WriteRes { Index: test/CodeGen/X86/avx-schedule.ll =================================================================== --- test/CodeGen/X86/avx-schedule.ll +++ test/CodeGen/X86/avx-schedule.ll @@ -910,14 +910,14 @@ ; ; BTVER2-LABEL: test_haddpd: ; BTVER2: # BB#0: -; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_haddpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 @@ -941,14 +941,14 @@ ; ; BTVER2-LABEL: test_haddps: ; BTVER2: # BB#0: -; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_haddps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 @@ -972,14 +972,14 @@ ; ; BTVER2-LABEL: test_hsubpd: ; BTVER2: # BB#0: -; 
BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_hsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 @@ -1003,14 +1003,14 @@ ; ; BTVER2-LABEL: test_hsubps: ; BTVER2: # BB#0: -; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_hsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32