Index: llvm/trunk/lib/Target/X86/X86InstrInfo.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrInfo.td +++ llvm/trunk/lib/Target/X86/X86InstrInfo.td @@ -2378,30 +2378,35 @@ multiclass bmi_bextr_bzhi opc, string mnemonic, RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, - PatFrag ld_frag> { + PatFrag ld_frag, X86FoldableSchedWrite Sched> { def rr : I, - T8PS, VEX, Sched<[WriteALU]>; + T8PS, VEX, Sched<[Sched]>; def rm : I, T8PS, VEX, - Sched<[WriteALULd, ReadAfterLd]>; + Sched<[Sched.Folded, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src2 + ReadAfterLd]>; } let Predicates = [HasBMI], Defs = [EFLAGS] in { defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem, - int_x86_bmi_bextr_32, loadi32>; + int_x86_bmi_bextr_32, loadi32, WriteBEXTR>; defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem, - int_x86_bmi_bextr_64, loadi64>, VEX_W; + int_x86_bmi_bextr_64, loadi64, WriteBEXTR>, VEX_W; } let Predicates = [HasBMI2], Defs = [EFLAGS] in { defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem, - int_x86_bmi_bzhi_32, loadi32>; + int_x86_bmi_bzhi_32, loadi32, WriteBZHI>; defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem, - int_x86_bmi_bzhi_64, loadi64>, VEX_W; + int_x86_bmi_bzhi_64, loadi64, WriteBZHI>, VEX_W; } def CountTrailingOnes : SDNodeXForm; +// BMI1 BEXTR, BMI2 BZHI +defm : BWWriteResPair; +defm : BWWriteResPair; + // Loads, stores, and moves, not folded with other operations. 
def : WriteRes { let Latency = 5; } def : WriteRes; @@ -492,7 +496,6 @@ "BLSI(32|64)rr", "BLSMSK(32|64)rr", "BLSR(32|64)rr", - "BZHI(32|64)rr", "LEA(16|32|64)(_32)?r", "MMX_PABSBrr", "MMX_PABSDrr", @@ -780,8 +783,7 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup19], (instregex "BEXTR(32|64)rr", - "BSWAP(16|32|64)r")>; +def: InstRW<[BWWriteResGroup19], (instregex "BSWAP(16|32|64)r")>; def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> { let Latency = 2; @@ -1442,7 +1444,6 @@ "BLSI(32|64)rm", "BLSMSK(32|64)rm", "BLSR(32|64)rm", - "BZHI(32|64)rm", "MMX_PABSBrm", "MMX_PABSDrm", "MMX_PABSWrm", @@ -1833,13 +1834,6 @@ def: InstRW<[BWWriteResGroup84], (instregex "LRETQ", "RETQ")>; -def BWWriteResGroup85 : SchedWriteRes<[BWPort23,BWPort06,BWPort15]> { - let Latency = 7; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[BWWriteResGroup85], (instregex "BEXTR(32|64)rm")>; - def BWWriteResGroup86 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> { let Latency = 7; let NumMicroOps = 3; Index: llvm/trunk/lib/Target/X86/X86SchedHaswell.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedHaswell.td +++ llvm/trunk/lib/Target/X86/X86SchedHaswell.td @@ -128,6 +128,10 @@ defm : HWWriteResPair; defm : HWWriteResPair; +// BMI1 BEXTR, BMI2 BZHI +defm : HWWriteResPair; +defm : HWWriteResPair; + // This is quite rough, latency depends on the dividend. defm : HWWriteResPair; // Scalar and vector floating point. 
@@ -844,7 +848,6 @@ "BLSI(32|64)rr", "BLSMSK(32|64)rr", "BLSR(32|64)rr", - "BZHI(32|64)rr", "LEA(16|32|64)(_32)?r", "MMX_PABSBrr", "MMX_PABSDrr", @@ -1230,7 +1233,6 @@ "BLSI(32|64)rm", "BLSMSK(32|64)rm", "BLSR(32|64)rm", - "BZHI(32|64)rm", "MMX_PABSBrm", "MMX_PABSDrm", "MMX_PABSWrm", @@ -1606,8 +1608,7 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup34], (instregex "BEXTR(32|64)rr", - "BSWAP(16|32|64)r")>; +def: InstRW<[HWWriteResGroup34], (instregex "BSWAP(16|32|64)r")>; def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> { let Latency = 2; @@ -1711,13 +1712,6 @@ "RETL", "RETQ")>; -def HWWriteResGroup42 : SchedWriteRes<[HWPort23,HWPort06,HWPort15]> { - let Latency = 7; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[HWWriteResGroup42], (instregex "BEXTR(32|64)rm")>; - def HWWriteResGroup43 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> { let Latency = 7; let NumMicroOps = 3; Index: llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td +++ llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td @@ -119,6 +119,11 @@ defm : SBWriteResPair; defm : SBWriteResPair; +// BMI1 BEXTR, BMI2 BZHI +// NOTE: These don't exist on Sandy Bridge. Ports are guesses. +defm : SBWriteResPair; +defm : SBWriteResPair; + // Scalar and vector floating point. def : WriteRes; def : WriteRes { let Latency = 6; } Index: llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td +++ llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td @@ -120,6 +120,10 @@ // Integer shifts and rotates. defm : SKLWriteResPair; +// BMI1 BEXTR, BMI2 BZHI +defm : SKLWriteResPair; +defm : SKLWriteResPair; + // Loads, stores, and moves, not folded with other operations. 
def : WriteRes { let Latency = 5; } def : WriteRes; @@ -558,7 +562,6 @@ "BLSI(32|64)rr", "BLSMSK(32|64)rr", "BLSR(32|64)rr", - "BZHI(32|64)rr", "LEA(16|32|64)(_32)?r")>; def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> { @@ -802,8 +805,7 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup22], (instregex "BEXTR(32|64)rr", - "BSWAP(16|32|64)r")>; +def: InstRW<[SKLWriteResGroup22], (instregex "BSWAP(16|32|64)r")>; def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> { let Latency = 2; @@ -1464,7 +1466,6 @@ "BLSI(32|64)rm", "BLSMSK(32|64)rm", "BLSR(32|64)rm", - "BZHI(32|64)rm", "MOVBE(16|32|64)rm")>; def SKLWriteResGroup76 : SchedWriteRes<[SKLPort23,SKLPort0156]> { @@ -1806,13 +1807,6 @@ def: InstRW<[SKLWriteResGroup98], (instregex "LRETQ", "RETQ")>; -def SKLWriteResGroup99 : SchedWriteRes<[SKLPort23,SKLPort06,SKLPort15]> { - let Latency = 7; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[SKLWriteResGroup99], (instregex "BEXTR(32|64)rm")>; - def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> { let Latency = 7; let NumMicroOps = 5; Index: llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td +++ llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td @@ -120,6 +120,10 @@ defm : SKXWriteResPair; defm : SKXWriteResPair; +// BMI1 BEXTR, BMI2 BZHI +defm : SKXWriteResPair; +defm : SKXWriteResPair; + // Loads, stores, and moves, not folded with other operations. 
def : WriteRes { let Latency = 5; } def : WriteRes; @@ -1034,7 +1038,6 @@ "BLSI(32|64)rr", "BLSMSK(32|64)rr", "BLSR(32|64)rr", - "BZHI(32|64)rr", "LEA(16|32|64)(_32)?r")>; def SKXWriteResGroup9 : SchedWriteRes<[SKXPort015]> { @@ -1597,8 +1600,7 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup22], (instregex "BEXTR(32|64)rr", - "BSWAP(16|32|64)r")>; +def: InstRW<[SKXWriteResGroup22], (instregex "BSWAP(16|32|64)r")>; def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> { let Latency = 2; @@ -3094,7 +3096,6 @@ "BLSI(32|64)rm", "BLSMSK(32|64)rm", "BLSR(32|64)rm", - "BZHI(32|64)rm", "MOVBE(16|32|64)rm")>; def SKXWriteResGroup80 : SchedWriteRes<[SKXPort23,SKXPort015]> { @@ -3753,13 +3754,6 @@ def: InstRW<[SKXWriteResGroup104], (instregex "LRETQ", "RETQ")>; -def SKXWriteResGroup105 : SchedWriteRes<[SKXPort23,SKXPort06,SKXPort15]> { - let Latency = 7; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[SKXWriteResGroup105], (instregex "BEXTR(32|64)rm")>; - def SKXWriteResGroup106 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> { let Latency = 7; let NumMicroOps = 4; Index: llvm/trunk/lib/Target/X86/X86Schedule.td =================================================================== --- llvm/trunk/lib/Target/X86/X86Schedule.td +++ llvm/trunk/lib/Target/X86/X86Schedule.td @@ -54,6 +54,10 @@ // Integer shifts and rotates. defm WriteShift : X86SchedWritePair; +// BMI1 BEXTR, BMI2 BZHI +defm WriteBEXTR : X86SchedWritePair; +defm WriteBZHI : X86SchedWritePair; + // Loads, stores, and moves, not folded with other operations. 
def WriteLoad : SchedWrite; def WriteStore : SchedWrite; Index: llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td +++ llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td @@ -141,6 +141,10 @@ defm : JWriteResIntPair; defm : JWriteResIntPair; +// BMI1 BEXTR, BMI2 BZHI +defm : JWriteResIntPair; +defm : JWriteResIntPair; // NOTE: Doesn't exist on Jaguar. + def JWriteIMul64 : SchedWriteRes<[JALU1, JMul]> { let Latency = 6; let ResourceCycles = [1, 4]; Index: llvm/trunk/lib/Target/X86/X86ScheduleSLM.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleSLM.td +++ llvm/trunk/lib/Target/X86/X86ScheduleSLM.td @@ -104,6 +104,11 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; +// BMI1 BEXTR, BMI2 BZHI +// NOTE: These don't exist on Silvermont. Ports are guesses. +defm : SLMWriteResPair; +defm : SLMWriteResPair; + // This is quite rough, latency depends on the dividend. defm : SLMWriteResPair; Index: llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td +++ llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td @@ -162,6 +162,10 @@ // Treat misc copies as a move. def : InstRW<[WriteMove], (instrs COPY)>; +// BMI1 BEXTR, BMI2 BZHI +defm : ZnWriteResPair; +defm : ZnWriteResPair; + // IDIV def : WriteRes { let Latency = 41; @@ -564,25 +568,13 @@ // r,m. def : InstRW<[ZnWriteALULat2Ld, ReadAfterLd], (instregex "BLS(I|MSK|R)(32|64)rm")>; -// BEXTR. -// r,r,r. -def : InstRW<[WriteALU], (instregex "BEXTR(32|64)rr")>; -// r,m,r. -def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BEXTR(32|64)rm")>; - -// BZHI. -// r,r,r. -def : InstRW<[WriteALU], (instregex "BZHI(32|64)rr")>; -// r,m,r. -def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BZHI(32|64)rm")>; - // CLD STD. 
def : InstRW<[WriteALU], (instregex "STD", "CLD")>; // PDEP PEXT. // r,r,r. def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>; -// r,m,r. +// r,r,m. def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>; // ROR ROL. Index: llvm/trunk/test/CodeGen/X86/bmi-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/bmi-schedule.ll +++ llvm/trunk/test/CodeGen/X86/bmi-schedule.ll @@ -172,8 +172,8 @@ define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) { ; GENERIC-LABEL: test_bextr_i32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx # sched: [5:0.50] -; GENERIC-NEXT: bextrl %edi, %esi, %eax # sched: [1:0.33] +; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx # sched: [6:1.00] +; GENERIC-NEXT: bextrl %edi, %esi, %eax # sched: [2:1.00] ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -222,8 +222,8 @@ define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) { ; GENERIC-LABEL: test_bextr_i64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [5:0.50] -; GENERIC-NEXT: bextrq %rdi, %rsi, %rax # sched: [1:0.33] +; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [6:1.00] +; GENERIC-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:1.00] ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; Index: llvm/trunk/test/CodeGen/X86/bmi2-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/bmi2-schedule.ll +++ llvm/trunk/test/CodeGen/X86/bmi2-schedule.ll @@ -9,8 +9,8 @@ define i32 @test_bzhi_i32(i32 %a0, i32 %a1, i32 *%a2) { ; GENERIC-LABEL: test_bzhi_i32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx # sched: [5:0.50] -; GENERIC-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.33] +; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx # sched: [5:1.00] +; GENERIC-NEXT: bzhil %edi, %esi, %eax # sched: [1:1.00] ; GENERIC-NEXT: 
addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -59,8 +59,8 @@ define i64 @test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) { ; GENERIC-LABEL: test_bzhi_i64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [5:0.50] -; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.33] +; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [5:1.00] +; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:1.00] ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ;