Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -1085,14 +1085,14 @@ (ins VR128X:$src1, u8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, - EVEX, VEX_WIG, Sched<[WriteFBlend]>; + EVEX, VEX_WIG, Sched<[WriteVecExtract]>; def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs), (ins f32mem:$dst, VR128X:$src1, u8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), addr:$dst)]>, - EVEX, VEX_WIG, EVEX_CD8<32, CD8VT1>, Sched<[WriteFBlendLd, WriteRMW]>; + EVEX, VEX_WIG, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecExtractSt]>; //===---------------------------------------------------------------------===// // AVX-512 BROADCAST @@ -9878,7 +9878,7 @@ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), imm:$src2))), addr:$dst)]>, - EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteShuffleLd, WriteRMW]>; + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecExtractSt]>; } multiclass avx512_extract_elt_b { @@ -9888,7 +9888,7 @@ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>, - EVEX, TAPD, Sched<[WriteShuffle]>; + EVEX, TAPD, Sched<[WriteVecExtract]>; defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD; } @@ -9901,14 +9901,14 @@ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>, - EVEX, PD, Sched<[WriteShuffle]>; + EVEX, PD, Sched<[WriteVecExtract]>; let hasSideEffects = 0 in def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), (ins _.RC:$src1, u8imm:$src2), OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX, TAPD, FoldGenData, - Sched<[WriteShuffle]>; + Sched<[WriteVecExtract]>; defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; } @@ -9922,7 +9922,7 @@ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GRC:$dst, (extractelt (_.VT _.RC:$src1), imm:$src2))]>, - EVEX, TAPD, Sched<[WriteShuffle]>; + EVEX, TAPD, Sched<[WriteVecExtract]>; def mr : AVX512Ii8<0x16, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), @@ -9930,7 +9930,7 @@ [(store (extractelt (_.VT _.RC:$src1), imm:$src2),addr:$dst)]>, EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD, - Sched<[WriteShuffleLd, WriteRMW]>; + Sched<[WriteVecExtractSt]>; } } @@ -9946,7 +9946,7 @@ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteShuffleLd, ReadAfterLd]>; + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsertLd, ReadAfterLd]>; } multiclass avx512_insert_elt_bw opc, string OpcodeStr, SDNode OpNode, @@ -9957,7 +9957,7 @@ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V, - Sched<[WriteShuffle]>; + Sched<[WriteVecInsert]>; defm NAME : avx512_insert_elt_m; } @@ -9971,7 +9971,7 @@ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>, - EVEX_4V, TAPD, Sched<[WriteShuffle]>; + EVEX_4V, TAPD, Sched<[WriteVecInsert]>; defm NAME : avx512_insert_elt_m, TAPD; Index: llvm/trunk/lib/Target/X86/X86InstrMMX.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrMMX.td +++ llvm/trunk/lib/Target/X86/X86InstrMMX.td @@ -528,7 +528,7 @@ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1, imm:$src2))]>, - Sched<[WriteShuffle]>; + Sched<[WriteVecExtract]>; let Constraints = "$src1 = $dst" in { let Predicates = [HasSSE1] in { def MMX_PINSRWrr : MMXIi8<0xC4, MRMSrcReg, @@ -537,7 +537,7 @@ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteShuffle]>; + Sched<[WriteVecInsert]>; def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem, (outs VR64:$dst), @@ -546,7 +546,7 @@ [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, (i32 (anyext (loadi16 addr:$src2))), imm:$src3))]>, - Sched<[WriteShuffleLd, ReadAfterLd]>; + Sched<[WriteVecInsertLd, ReadAfterLd]>; } } Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -3782,7 +3782,7 @@ "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteShuffle]>; + Sched<[WriteVecInsert]>; def rm : Ii8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i16mem:$src2, u8imm:$src3), @@ -3792,7 +3792,7 @@ [(set VR128:$dst, (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), imm:$src3))]>, - Sched<[WriteShuffleLd, ReadAfterLd]>; + Sched<[WriteVecInsertLd, ReadAfterLd]>; } // Extract @@ -3802,13 +3802,13 @@ "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), imm:$src2))]>, - PD, VEX, Sched<[WriteShuffle]>; + PD, VEX, Sched<[WriteVecExtract]>; def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), imm:$src2))]>, - Sched<[WriteShuffle]>; + Sched<[WriteVecExtract]>; // Insert let Predicates = [HasAVX, NoBWI] in @@ -5085,15 +5085,14 @@ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>, - Sched<[WriteShuffle]>; - let hasSideEffects = 0, mayStore = 1, - SchedRW = [WriteShuffleLd, WriteRMW] in + Sched<[WriteVecExtract]>; + let hasSideEffects = 0, mayStore = 1 in def mr : SS4AIi8; + addr:$dst)]>, Sched<[WriteVecExtractSt]>; } let Predicates = [HasAVX, NoBWI] in @@ -5109,16 +5108,15 @@ (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - Sched<[WriteShuffle]>, FoldGenData; + Sched<[WriteVecExtract]>, FoldGenData; - let hasSideEffects = 0, mayStore = 1, - SchedRW = [WriteShuffleLd, WriteRMW] in + let hasSideEffects = 0, mayStore = 1 in def mr : SS4AIi8; + addr:$dst)]>, Sched<[WriteVecExtractSt]>; } let Predicates = [HasAVX, NoBWI] in @@ -5135,14 +5133,13 @@ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32:$dst, (extractelt (v4i32 VR128:$src1), imm:$src2))]>, - Sched<[WriteShuffle]>; - let SchedRW = [WriteShuffleLd, WriteRMW] in + Sched<[WriteVecExtract]>; def mr : SS4AIi8; + addr:$dst)]>, Sched<[WriteVecExtractSt]>; } let Predicates = [HasAVX, NoDQI] in @@ -5158,14 +5155,13 @@ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR64:$dst, (extractelt (v2i64 VR128:$src1), imm:$src2))]>, - Sched<[WriteShuffle]>; - let SchedRW = [WriteShuffleLd, WriteRMW] in + Sched<[WriteVecExtract]>; def mr : SS4AIi8; + addr:$dst)]>, Sched<[WriteVecExtractSt]>; } let Predicates = [HasAVX, NoDQI] in @@ -5182,14 +5178,13 @@ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32orGR64:$dst, (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>, - Sched<[WriteFBlend]>; - let SchedRW = [WriteFBlendLd, WriteRMW] in + Sched<[WriteVecExtract]>; def mr : SS4AIi8; + addr:$dst)]>, Sched<[WriteVecExtractSt]>; } let ExeDomain = SSEPackedSingle in { @@ -5223,7 +5218,7 @@ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, - Sched<[WriteShuffle]>; + Sched<[WriteVecInsert]>; def rm : SS4AIi8, Sched<[WriteShuffleLd, ReadAfterLd]>; + imm:$src3))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; } let Predicates = [HasAVX, NoBWI] in @@ -5249,7 +5244,7 @@ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, - Sched<[WriteShuffle]>; + Sched<[WriteVecInsert]>; def rm : SS4AIi8, Sched<[WriteShuffleLd, ReadAfterLd]>; + imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; } let Predicates = [HasAVX, NoDQI] in @@ -5275,7 +5270,7 @@ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, - Sched<[WriteShuffle]>; + Sched<[WriteVecInsert]>; def rm : SS4AIi8, Sched<[WriteShuffleLd, ReadAfterLd]>; + imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; } let Predicates = [HasAVX, NoDQI] in Index: llvm/trunk/lib/Target/X86/X86SchedBroadwell.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedBroadwell.td +++ llvm/trunk/lib/Target/X86/X86SchedBroadwell.td @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// def BroadwellModel : SchedMachineModel { - // All x86 instructions are modeled as a single micro-op, and HW can decode 4 + // All x86 instructions are modeled as a single micro-op, and BW can decode 4 // instructions per cycle. let IssueWidth = 4; let MicroOpBufferSize = 192; // Based on the reorder buffer. @@ -190,6 +190,26 @@ defm : BWWriteResPair; // Vector MPSAD. defm : BWWriteResPair; // Vector PSADBW. +// Vector insert/extract operations. +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes { + let Latency = 6; + let NumMicroOps = 2; +} + +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} +def : WriteRes { + let Latency = 2; + let NumMicroOps = 3; +} + // Conversion between integer and float. defm : BWWriteResPair; // Float -> Integer. defm : BWWriteResPair; // Integer -> Float. @@ -462,17 +482,6 @@ "(V?)MOVUPD(Y?)mr", "(V?)MOVUPS(Y?)mr")>; -def BWWriteResGroup11 : SchedWriteRes<[BWPort5]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[BWWriteResGroup11], (instregex "MMX_PINSRWrr", - "(V?)PINSRBrr", - "(V?)PINSRDrr", - "(V?)PINSRQrr", - "(V?)PINSRWrr")>; - def BWWriteResGroup12 : SchedWriteRes<[BWPort01]> { let Latency = 2; let NumMicroOps = 2; @@ -505,15 +514,9 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup15], (instregex "MMX_PEXTRWrr", - "VCVTPH2PS(Y?)rr", +def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PS(Y?)rr", "(V?)CVTPS2PDrr", "(V?)CVTSS2SDrr", - "(V?)EXTRACTPSrr", - "(V?)PEXTRBrr", - "(V?)PEXTRDrr", - "(V?)PEXTRQrr", - "(V?)PEXTRWrr", "(V?)PSLLDrr", "(V?)PSLLQrr", "(V?)PSLLWrr", @@ -573,17 +576,6 @@ "SBB8ri", "SET(A|BE)r")>; -def BWWriteResGroup21 : SchedWriteRes<[BWPort4,BWPort5,BWPort237]> { - let Latency = 2; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[BWWriteResGroup21], (instregex "(V?)EXTRACTPSmr", - "(V?)PEXTRBmr", - "(V?)PEXTRDmr", - "(V?)PEXTRQmr", - "(V?)PEXTRWmr")>; - def BWWriteResGroup22 : SchedWriteRes<[BWPort4,BWPort6,BWPort237]> { let Latency = 2; let NumMicroOps = 3; Index: llvm/trunk/lib/Target/X86/X86SchedHaswell.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedHaswell.td +++ llvm/trunk/lib/Target/X86/X86SchedHaswell.td @@ -189,6 +189,26 @@ defm : HWWriteResPair; defm : HWWriteResPair; +// Vector insert/extract operations. +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes { + let Latency = 6; + let NumMicroOps = 2; +} + +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} +def : WriteRes { + let Latency = 2; + let NumMicroOps = 3; +} + // String instructions. // Packed Compare Implicit Length Strings, Return Mask @@ -1092,17 +1112,6 @@ } def: InstRW<[HWWriteResGroup19], (instregex "SFENCE")>; -def HWWriteResGroup20 : SchedWriteRes<[HWPort4,HWPort5,HWPort237]> { - let Latency = 2; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[HWWriteResGroup20], (instregex "(V?)EXTRACTPSmr", - "(V?)PEXTRBmr", - "(V?)PEXTRDmr", - "(V?)PEXTRQmr", - "(V?)PEXTRWmr")>; - def HWWriteResGroup21 : SchedWriteRes<[HWPort4,HWPort6,HWPort237]> { let Latency = 2; let NumMicroOps = 3; @@ -1160,17 +1169,6 @@ def: InstRW<[HWWriteResGroup26], (instregex "POP(16|32|64)rmm", "PUSH(16|32|64)rmm")>; -def HWWriteResGroup27 : SchedWriteRes<[HWPort5]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[HWWriteResGroup27], (instregex "MMX_PINSRWrr", - "(V?)PINSRBrr", - "(V?)PINSRDrr", - "(V?)PINSRQrr", - "(V?)PINSRWrr")>; - def HWWriteResGroup28 : SchedWriteRes<[HWPort01]> { let Latency = 2; let NumMicroOps = 2; @@ -1203,16 +1201,10 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup31], (instregex "MMX_PEXTRWrr", - "VCVTPH2PSYrr", +def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSYrr", "VCVTPH2PSrr", "(V?)CVTPS2PDrr", "(V?)CVTSS2SDrr", - "(V?)EXTRACTPSrr", - "(V?)PEXTRBrr", - "(V?)PEXTRDrr", - "(V?)PEXTRQrr", - "(V?)PEXTRWrr", "(V?)PSLLDrr", "(V?)PSLLQrr", "(V?)PSLLWrr", Index: llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td +++ llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td @@ -173,6 +173,25 @@ defm : SBWriteResPair; defm : SBWriteResPair; +// Vector insert/extract operations. +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} +def : WriteRes { + let Latency = 7; + let NumMicroOps = 2; +} + +def : WriteRes { + let Latency = 3; + let NumMicroOps = 2; +} +def : WriteRes { + let Latency = 5; + let NumMicroOps = 3; +} + //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// @@ -535,16 +554,6 @@ } def: InstRW<[SBWriteResGroup16_1], (instrs BSWAP32r)>; -def SBWriteResGroup17 : SchedWriteRes<[SBPort5,SBPort15]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SBWriteResGroup17], (instregex "(V?)PINSRBrr", - "(V?)PINSRDrr", - "(V?)PINSRQrr", - "(V?)PINSRWrr")>; - def SBWriteResGroup18 : SchedWriteRes<[SBPort5,SBPort015]> { let Latency = 2; let NumMicroOps = 2; @@ -590,16 +599,6 @@ } def: InstRW<[SBWriteResGroup22], (instregex "(V?)EXTRACTPSrr")>; -def SBWriteResGroup23 : SchedWriteRes<[SBPort0,SBPort15]> { - let Latency = 3; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SBWriteResGroup23], (instregex "(V?)PEXTRBrr", - "(V?)PEXTRDrr", - "(V?)PEXTRQrr", - "(V?)PEXTRWrr")>; - def SBWriteResGroup23_2 : SchedWriteRes<[SBPort05]> { let Latency = 3; let NumMicroOps = 3; @@ -793,15 +792,6 @@ def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPD(Y?)mr", "VMASKMOVPS(Y?)mr")>; -def SBWriteResGroup39 : SchedWriteRes<[SBPort4,SBPort23,SBPort15]> { - let Latency = 5; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[SBWriteResGroup39], (instregex "(V?)PEXTRBmr", - "VPEXTRDmr", - "VPEXTRWmr")>; - def SBWriteResGroup40 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { let Latency = 5; let NumMicroOps = 3; @@ -1009,10 +999,6 @@ "(V?)PCMPGTBrm", "(V?)PCMPGTDrm", "(V?)PCMPGTWrm", - "(V?)PINSRBrm", - "(V?)PINSRDrm", - "(V?)PINSRQrm", - "(V?)PINSRWrm", "(V?)PMAXSBrm", "(V?)PMAXSDrm", "(V?)PMAXSWrm", Index: llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td +++ llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td @@ -187,6 +187,26 @@ defm : SKLWriteResPair; // Vector MPSAD. defm : SKLWriteResPair; // Vector PSADBW. +// Vector insert/extract operations. +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes { + let Latency = 6; + let NumMicroOps = 2; +} + +def : WriteRes { + let Latency = 3; + let NumMicroOps = 2; +} +def : WriteRes { + let Latency = 2; + let NumMicroOps = 3; +} + // Conversion between integer and float. defm : SKLWriteResPair; // Float -> Integer. defm : SKLWriteResPair; // Integer -> Float. @@ -571,12 +591,7 @@ let NumMicroOps = 2; let ResourceCycles = [2]; } -def: InstRW<[SKLWriteResGroup13], (instregex "MMX_MOVQ2DQrr", - "MMX_PINSRWrr", - "(V?)PINSRBrr", - "(V?)PINSRDrr", - "(V?)PINSRQrr", - "(V?)PINSRWrr")>; +def: InstRW<[SKLWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>; def SKLWriteResGroup14 : SchedWriteRes<[SKLPort05]> { let Latency = 2; @@ -671,17 +686,6 @@ "SBB8i8", "SBB8ri")>; -def SKLWriteResGroup24 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> { - let Latency = 2; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[SKLWriteResGroup24], (instregex "(V?)EXTRACTPSmr", - "(V?)PEXTRBmr", - "(V?)PEXTRDmr", - "(V?)PEXTRQmr", - "(V?)PEXTRWmr")>; - def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> { let Latency = 2; let NumMicroOps = 3; @@ -761,13 +765,7 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup31], (instregex "MMX_PEXTRWrr", - "(V?)EXTRACTPSrr", - "(V?)PEXTRBrr", - "(V?)PEXTRDrr", - "(V?)PEXTRQrr", - "(V?)PEXTRWrr", - "(V?)PTEST(Y?)rr")>; +def: InstRW<[SKLWriteResGroup31], (instregex "(V?)PTEST(Y?)rr")>; def SKLWriteResGroup32 : SchedWriteRes<[SKLPort0,SKLPort0156]> { let Latency = 3; Index: llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td +++ llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td @@ -187,6 +187,26 @@ defm : SKXWriteResPair; // Vector MPSAD. defm : SKXWriteResPair; // Vector PSADBW. +// Vector insert/extract operations. +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes { + let Latency = 6; + let NumMicroOps = 2; +} + +def : WriteRes { + let Latency = 3; + let NumMicroOps = 2; +} +def : WriteRes { + let Latency = 2; + let NumMicroOps = 3; +} + // Conversion between integer and float. defm : SKXWriteResPair; // Float -> Integer. defm : SKXWriteResPair; // Integer -> Float. @@ -1035,20 +1055,7 @@ let NumMicroOps = 2; let ResourceCycles = [2]; } -def: InstRW<[SKXWriteResGroup13], (instregex "MMX_MOVQ2DQrr", - "MMX_PINSRWrr", - "PINSRBrr", - "PINSRDrr", - "PINSRQrr", - "PINSRWrr", - "VPINSRBZrr", - "VPINSRBrr", - "VPINSRDZrr", - "VPINSRDrr", - "VPINSRQZrr", - "VPINSRQrr", - "VPINSRWZrr", - "VPINSRWrr")>; +def: InstRW<[SKXWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>; def SKXWriteResGroup14 : SchedWriteRes<[SKXPort05]> { let Latency = 2; @@ -1163,27 +1170,6 @@ "SBB8i8", "SBB8ri")>; -def SKXWriteResGroup24 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> { - let Latency = 2; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[SKXWriteResGroup24], (instregex "EXTRACTPSmr", - "PEXTRBmr", - "PEXTRDmr", - "PEXTRQmr", - "PEXTRWmr", - "VEXTRACTPSZmr(b?)", - "VEXTRACTPSmr", - "VPEXTRBZmr(b?)", - "VPEXTRBmr", - "VPEXTRDZmr(b?)", - "VPEXTRDmr", - "VPEXTRQZmr(b?)", - "VPEXTRQmr", - "VPEXTRWZmr(b?)", - "VPEXTRWmr")>; - def SKXWriteResGroup25 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237]> { let Latency = 2; let NumMicroOps = 3; @@ -1455,25 +1441,7 @@ let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup33], (instregex "EXTRACTPSrr", - "MMX_PEXTRWrr", - "PEXTRBrr", - "PEXTRDrr", - "PEXTRQrr", - "PEXTRWrr", - "PTESTrr", - "VEXTRACTPSZrr", - "VEXTRACTPSrr", - "VPEXTRBZrr", - "VPEXTRBrr", - "VPEXTRDZrr", - "VPEXTRDrr", - "VPEXTRQZrr", - "VPEXTRQrr", - "VPEXTRWZrr", - "VPEXTRWrr", - "VPTESTYrr", - "VPTESTrr")>; +def: InstRW<[SKXWriteResGroup33], (instregex "(V?)PTEST(Y?)rr")>; def SKXWriteResGroup34 : SchedWriteRes<[SKXPort0,SKXPort0156]> { let Latency = 3; Index: llvm/trunk/lib/Target/X86/X86Schedule.td =================================================================== --- llvm/trunk/lib/Target/X86/X86Schedule.td +++ llvm/trunk/lib/Target/X86/X86Schedule.td @@ -117,6 +117,11 @@ defm WritePSADBW : X86SchedWritePair; // Vector PSADBW. defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD. +// Vector insert/extract operations. +defm WriteVecInsert : X86SchedWritePair; // Insert gpr to vector element. +def WriteVecExtract : SchedWrite; // Extract vector element to gpr. +def WriteVecExtractSt : SchedWrite; // Extract vector element and store. + // MOVMSK operations. def WriteFMOVMSK : SchedWrite; def WriteVecMOVMSK : SchedWrite; Index: llvm/trunk/lib/Target/X86/X86ScheduleAtom.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleAtom.td +++ llvm/trunk/lib/Target/X86/X86ScheduleAtom.td @@ -252,6 +252,14 @@ defm : AtomWriteResPair; // NOTE: Doesn't exist on Atom. //////////////////////////////////////////////////////////////////////////////// +// Vector insert/extract operations. +//////////////////////////////////////////////////////////////////////////////// + +defm : AtomWriteResPair; +def : WriteRes; +def : WriteRes; + +//////////////////////////////////////////////////////////////////////////////// // SSE42 String instructions. //////////////////////////////////////////////////////////////////////////////// Index: llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td +++ llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td @@ -385,23 +385,12 @@ defm : JWriteResFpuPair; // NOTE: Doesn't exist on Jaguar. //////////////////////////////////////////////////////////////////////////////// -// Vector Extraction instructions. +// Vector insert/extract operations. //////////////////////////////////////////////////////////////////////////////// -def JWritePEXTR : SchedWriteRes<[JFPU0, JFPA, JALU0]> { let Latency = 3; } -def : InstRW<[JWritePEXTR], (instrs MMX_PEXTRWrr, - EXTRACTPSrr, VEXTRACTPSrr, - PEXTRBrr, VPEXTRBrr, - PEXTRDrr, VPEXTRDrr, - PEXTRQrr, VPEXTRQrr, - PEXTRWrr, VPEXTRWrr, PEXTRWrr_REV, VPEXTRWrr_REV)>; - -def JWritePEXTRSt : SchedWriteRes<[JFPU1, JSTC, JSAGU]> { let Latency = 3; } -def : InstRW<[JWritePEXTRSt], (instrs EXTRACTPSmr, VEXTRACTPSmr, - PEXTRBmr, VPEXTRBmr, - PEXTRDmr, VPEXTRDmr, - PEXTRQmr, VPEXTRQmr, - PEXTRWmr, VPEXTRWmr)>; +defm : JWriteResFpuPair; +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } //////////////////////////////////////////////////////////////////////////////// // SSE42 String instructions. Index: llvm/trunk/lib/Target/X86/X86ScheduleSLM.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleSLM.td +++ llvm/trunk/lib/Target/X86/X86ScheduleSLM.td @@ -164,6 +164,16 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; +// Vector insert/extract operations. +defm : SLMWriteResPair; + +def : WriteRes; +def : WriteRes { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} + //////////////////////////////////////////////////////////////////////////////// // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// Index: llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td +++ llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td @@ -233,6 +233,19 @@ // Vector Shift Operations defm : ZnWriteResFpuPair; +// Vector insert/extract operations. +defm : ZnWriteResFpuPair; + +def : WriteRes { + let Latency = 2; + let ResourceCycles = [1, 2]; +} +def : WriteRes { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 2, 3]; +} + // MOVMSK Instructions. def : WriteRes; def : WriteRes; @@ -987,22 +1000,6 @@ } def : InstRW<[ZnWritePMOVMSKBY], (instregex "(V|MMX_)?PMOVMSKBYrr")>; -// PEXTR B/W/D/Q. -// r32,x,i. -def ZnWritePEXTRr : SchedWriteRes<[ZnFPU12, ZnFPU2]> { - let Latency = 2; - let ResourceCycles = [1, 2]; -} -def : InstRW<[ZnWritePEXTRr], (instregex "(V?)PEXTR(B|W|D|Q)rr", "MMX_PEXTRWrr")>; - -def ZnWritePEXTRm : SchedWriteRes<[ZnAGU, ZnFPU12, ZnFPU2]> { - let Latency = 5; - let NumMicroOps = 2; - let ResourceCycles = [1, 2, 3]; -} -// m8,x,i. -def : InstRW<[ZnWritePEXTRm], (instregex "(V?)PEXTR(B|W|D|Q)mr")>; - // VPBROADCAST B/W. // x, m8/16. def ZnWriteVPBROADCAST128Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> { Index: llvm/trunk/test/CodeGen/X86/mmx-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/mmx-schedule.ll +++ llvm/trunk/test/CodeGen/X86/mmx-schedule.ll @@ -2978,7 +2978,7 @@ define i32 @test_pextrw(x86_mmx %a0) optsize { ; GENERIC-LABEL: test_pextrw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: pextrw $0, %mm0, %eax # sched: [1:1.00] +; GENERIC-NEXT: pextrw $0, %mm0, %eax # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; ATOM-LABEL: test_pextrw: @@ -2993,7 +2993,7 @@ ; ; SANDY-LABEL: test_pextrw: ; SANDY: # %bb.0: -; SANDY-NEXT: pextrw $0, %mm0, %eax # sched: [1:1.00] +; SANDY-NEXT: pextrw $0, %mm0, %eax # sched: [3:1.00] ; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pextrw: @@ -3501,9 +3501,9 @@ define i64 @test_pinsrw(x86_mmx %a0, i32 %a1, i16* %a2) optsize { ; GENERIC-LABEL: test_pinsrw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: pinsrw $0, %edi, %mm0 # sched: [1:1.00] +; GENERIC-NEXT: pinsrw $0, %edi, %mm0 # sched: [2:1.00] ; GENERIC-NEXT: movswl (%rsi), %eax # sched: [5:0.50] -; GENERIC-NEXT: pinsrw $1, %eax, %mm0 # sched: [1:1.00] +; GENERIC-NEXT: pinsrw $1, %eax, %mm0 # sched: [2:1.00] ; GENERIC-NEXT: movq %mm0, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3525,9 +3525,9 @@ ; ; SANDY-LABEL: test_pinsrw: ; SANDY: # %bb.0: -; SANDY-NEXT: pinsrw $0, %edi, %mm0 # sched: [1:1.00] +; SANDY-NEXT: pinsrw $0, %edi, %mm0 # sched: [2:1.00] ; SANDY-NEXT: movswl (%rsi), %eax # sched: [5:0.50] -; SANDY-NEXT: pinsrw $1, %eax, %mm0 # sched: [1:1.00] +; SANDY-NEXT: pinsrw $1, %eax, %mm0 # sched: [2:1.00] ; SANDY-NEXT: movq %mm0, %rax # sched: [1:0.33] ; SANDY-NEXT: retq # sched: [1:1.00] ; Index: llvm/trunk/test/CodeGen/X86/sse41-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse41-schedule.ll +++ llvm/trunk/test/CodeGen/X86/sse41-schedule.ll @@ -1903,7 +1903,7 @@ ; GENERIC-LABEL: test_pextrw: ; GENERIC: # %bb.0: ; GENERIC-NEXT: pextrw $3, %xmm0, %eax # sched: [3:1.00] -; GENERIC-NEXT: pextrw $1, %xmm0, (%rdi) # sched: [6:1.00] +; GENERIC-NEXT: pextrw $1, %xmm0, (%rdi) # sched: [5:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SLM-LABEL: test_pextrw: @@ -1915,7 +1915,7 @@ ; SANDY-SSE-LABEL: test_pextrw: ; SANDY-SSE: # %bb.0: ; SANDY-SSE-NEXT: pextrw $3, %xmm0, %eax # sched: [3:1.00] -; SANDY-SSE-NEXT: pextrw $1, %xmm0, (%rdi) # sched: [6:1.00] +; SANDY-SSE-NEXT: pextrw $1, %xmm0, (%rdi) # sched: [5:1.00] ; SANDY-SSE-NEXT: retq # sched: [1:1.00] ; ; SANDY-LABEL: test_pextrw: Index: llvm/trunk/test/tools/llvm-mca/X86/SandyBridge/resources-sse1.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/SandyBridge/resources-sse1.s +++ llvm/trunk/test/tools/llvm-mca/X86/SandyBridge/resources-sse1.s @@ -268,9 +268,9 @@ # CHECK-NEXT: 2 8 1.00 * pavgb (%rax), %mm2 # CHECK-NEXT: 1 3 1.00 pavgw %mm0, %mm2 # CHECK-NEXT: 2 8 1.00 * pavgw (%rax), %mm2 -# CHECK-NEXT: 1 1 1.00 pextrw $1, %mm0, %ecx -# CHECK-NEXT: 1 1 1.00 pinsrw $1, %eax, %mm2 -# CHECK-NEXT: 2 6 1.00 * pinsrw $1, (%rax), %mm2 +# CHECK-NEXT: 2 3 1.00 pextrw $1, %mm0, %ecx +# CHECK-NEXT: 2 2 1.00 pinsrw $1, %eax, %mm2 +# CHECK-NEXT: 2 7 0.50 * pinsrw $1, (%rax), %mm2 # CHECK-NEXT: 1 3 1.00 pmaxsw %mm0, %mm2 # CHECK-NEXT: 2 8 1.00 * pmaxsw (%rax), %mm2 # CHECK-NEXT: 1 3 1.00 pmaxub %mm0, %mm2 @@ -331,7 +331,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - 112.00 40.00 54.00 10.00 35.00 33.50 33.50 +# CHECK-NEXT: - 112.00 41.00 55.50 10.00 34.50 33.50 33.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: @@ -409,9 +409,9 @@ # CHECK-NEXT: - - - 1.00 - - 0.50 0.50 pavgb (%rax), %mm2 # CHECK-NEXT: - - - 1.00 - - - - pavgw %mm0, %mm2 # CHECK-NEXT: - - - 1.00 - - 0.50 0.50 pavgw (%rax), %mm2 -# CHECK-NEXT: - - - - - 1.00 - - pextrw $1, %mm0, %ecx -# CHECK-NEXT: - - - - - 1.00 - - pinsrw $1, %eax, %mm2 -# CHECK-NEXT: - - - - - 1.00 0.50 0.50 pinsrw $1, (%rax), %mm2 +# CHECK-NEXT: - - 1.00 0.50 - 0.50 - - pextrw $1, %mm0, %ecx +# CHECK-NEXT: - - - 0.50 - 1.50 - - pinsrw $1, %eax, %mm2 +# CHECK-NEXT: - - - 0.50 - 0.50 0.50 0.50 pinsrw $1, (%rax), %mm2 # CHECK-NEXT: - - - 1.00 - - - - pmaxsw %mm0, %mm2 # CHECK-NEXT: - - - 1.00 - - 0.50 0.50 pmaxsw (%rax), %mm2 # CHECK-NEXT: - - - 1.00 - - - - pmaxub %mm0, %mm2 Index: llvm/trunk/test/tools/llvm-mca/X86/SandyBridge/resources-sse41.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/SandyBridge/resources-sse41.s +++ llvm/trunk/test/tools/llvm-mca/X86/SandyBridge/resources-sse41.s @@ -188,7 +188,7 @@ # CHECK-NEXT: 4 5 1.00 * pextrd $1, %xmm0, (%rax) # CHECK-NEXT: 2 3 1.00 pextrq $1, %xmm0, %rcx # CHECK-NEXT: 4 5 1.00 * pextrq $1, %xmm0, (%rax) -# CHECK-NEXT: 3 6 1.00 * pextrw $1, %xmm0, (%rax) +# CHECK-NEXT: 3 5 1.00 * pextrw $1, %xmm0, (%rax) # CHECK-NEXT: 1 5 1.00 phminposuw %xmm0, %xmm2 # CHECK-NEXT: 2 11 1.00 * phminposuw (%rax), %xmm2 # CHECK-NEXT: 2 2 1.00 pinsrb $1, %eax, %xmm1 @@ -264,7 +264,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] -# CHECK-NEXT: - - 26.00 47.00 5.00 53.00 25.00 25.00 +# CHECK-NEXT: - - 26.00 47.50 5.00 52.50 24.50 24.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: @@ -301,7 +301,7 @@ # CHECK-NEXT: - - 1.00 0.50 1.00 0.50 0.50 0.50 pextrd $1, %xmm0, (%rax) # CHECK-NEXT: - - 1.00 0.50 - 0.50 - - pextrq $1, %xmm0, %rcx # CHECK-NEXT: - - 1.00 0.50 1.00 0.50 0.50 0.50 pextrq $1, %xmm0, (%rax) -# CHECK-NEXT: - - - - 1.00 1.00 1.00 1.00 pextrw $1, %xmm0, (%rax) +# CHECK-NEXT: - - - 0.50 1.00 0.50 0.50 0.50 pextrw $1, %xmm0, (%rax) # CHECK-NEXT: - - 1.00 - - - - - phminposuw %xmm0, %xmm2 # CHECK-NEXT: - - 1.00 - - - 0.50 0.50 phminposuw (%rax), %xmm2 # CHECK-NEXT: - - - 0.50 - 1.50 - - pinsrb $1, %eax, %xmm1