Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -649,12 +649,12 @@ let hasSideEffects = 0 in def rr : PI, - Sched<[WriteFShuffle]>; + Sched<[WriteVecMove]>; let canFoldAsLoad = 1, isReMaterializable = 1 in def rm : PI, - Sched<[WriteLoad]>; + Sched<[WriteVecLoad]>; } let Predicates = [HasAVX, NoVLX] in { @@ -702,7 +702,7 @@ PD; } -let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in { +let SchedRW = [WriteVecStore], Predicates = [HasAVX, NoVLX] in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)], @@ -801,7 +801,7 @@ def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}", (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>; -let SchedRW = [WriteStore] in { +let SchedRW = [WriteVecStore] in { def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)], @@ -3385,7 +3385,7 @@ //===----------------------------------------------------------------------===// let AddedComplexity = 400 in { // Prefer non-temporal versions -let SchedRW = [WriteStore] in { +let SchedRW = [WriteVecStore] in { let Predicates = [HasAVX, NoVLX] in { def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), @@ -3455,7 +3455,7 @@ [(nontemporalstore (i64 GR64:$src), addr:$dst)], IIC_SSE_MOVNT>, PS, Requires<[HasSSE2]>; -} // SchedRW = [WriteStore] +} // SchedRW = [WriteVecStore] let Predicates = [HasAVX, NoVLX] in { def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), @@ -3560,7 +3560,7 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions -let hasSideEffects = 0, SchedRW = [WriteMove] in { +let hasSideEffects = 0, SchedRW = [WriteVecMove] in { def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG; @@ -3577,7 +3577,7 @@ // For Disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, - SchedRW = [WriteMove] in { + SchedRW = [WriteVecMove] in { def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, @@ -3597,7 +3597,7 @@ } let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, - hasSideEffects = 0, SchedRW = [WriteLoad], Predicates = [HasAVX,NoVLX] in { + hasSideEffects = 0, SchedRW = [WriteVecLoad], Predicates = [HasAVX,NoVLX] in { def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (alignedloadv2i64 addr:$src))], @@ -3614,7 +3614,7 @@ XS, VEX, VEX_L, VEX_WIG; } -let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore], +let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteVecStore], Predicates = [HasAVX,NoVLX] in { def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), @@ -3634,7 +3634,7 @@ XS, VEX, VEX_L, VEX_WIG; } -let SchedRW = [WriteMove] in { +let SchedRW = [WriteVecMove] in { let hasSideEffects = 0 in { def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -3658,7 +3658,7 @@ } // SchedRW let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, - hasSideEffects = 0, SchedRW = [WriteLoad] in { + hasSideEffects = 0, SchedRW = [WriteVecLoad] in { def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqa\t{$src, $dst|$dst, $src}", [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/], @@ -3670,7 +3670,7 @@ XS, Requires<[UseSSE2]>; } -let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { +let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteVecStore] in { def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/], @@ -4302,7 +4302,7 @@ // SSE2 - Conditional Store //===---------------------------------------------------------------------===// -let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in { let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), @@ -4826,7 +4826,7 @@ // SSE3 - Move Unaligned Integer //===---------------------------------------------------------------------===// -let SchedRW = [WriteLoad] in { +let SchedRW = [WriteVecLoad] in { let Predicates = [HasAVX] in { def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vlddqu\t{$src, $dst|$dst, $src}", @@ -6832,7 +6832,7 @@ } let AddedComplexity = 400 in { // Prefer non-temporal versions -let SchedRW = [WriteLoad] in { +let SchedRW = [WriteVecLoad] in { let Predicates = [HasAVX, NoVLX] in def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", []>, @@ -7615,7 +7615,7 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), (ins f128mem:$dst, VR256:$src1, u8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, Sched<[WriteStore]>, VEX, VEX_L; + []>, Sched<[WriteVecStore]>, VEX, VEX_L; } multiclass vextract_lowering { @@ -7652,22 +7652,22 @@ (ins VR128:$src1, f128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))], - IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>; + IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteVecLoad]>; def Yrm : AVX8I, VEX_4V, VEX_L, Sched<[WriteLoad]>; + IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteVecLoad]>; def mr : AVX8I, - VEX_4V, Sched<[WriteStore]>; + VEX_4V, Sched<[WriteVecStore]>; def Ymr : AVX8I, - VEX_4V, VEX_L, Sched<[WriteStore]>; + VEX_4V, VEX_L, Sched<[WriteVecStore]>; } let ExeDomain = SSEPackedSingle in @@ -8270,7 +8270,7 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), (ins i128mem:$dst, VR256:$src1, u8imm:$src2), "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - Sched<[WriteStore]>, VEX, VEX_L; + Sched<[WriteVecStore]>, VEX, VEX_L; let Predicates = [HasAVX2, NoVLX] in { defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>; @@ -8289,22 +8289,22 @@ (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))], - IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>; + IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteVecLoad]>; def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))], - IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>; + IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteVecLoad]>; def mr : AVX28I<0x8e, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>, - VEX_4V, Sched<[WriteStore]>; + VEX_4V, Sched<[WriteVecStore]>; def Ymr : AVX28I<0x8e, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>, - VEX_4V, VEX_L, Sched<[WriteStore]>; + VEX_4V, VEX_L, Sched<[WriteVecStore]>; } defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", Index: lib/Target/X86/X86SchedBroadwell.td =================================================================== --- lib/Target/X86/X86SchedBroadwell.td +++ lib/Target/X86/X86SchedBroadwell.td @@ -116,6 +116,11 @@ def : WriteRes; def : WriteRes; +// Vector loads, stores, and moves, not folded with other operations. +def : WriteRes { let Latency = 5; } +def : WriteRes; +def : WriteRes; + // Idioms that clear a register, like xorps %xmm0, %xmm0. // These can often bypass execution ports completely. def : WriteRes; Index: lib/Target/X86/X86SchedHaswell.td =================================================================== --- lib/Target/X86/X86SchedHaswell.td +++ lib/Target/X86/X86SchedHaswell.td @@ -103,6 +103,10 @@ def : WriteRes; def : WriteRes; +def : WriteRes; +def : WriteRes { let Latency = 5; } +def : WriteRes; + defm : HWWriteResPair; defm : HWWriteResPair; def : WriteRes { let Latency = 3; } Index: lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- lib/Target/X86/X86SchedSandyBridge.td +++ lib/Target/X86/X86SchedSandyBridge.td @@ -92,6 +92,10 @@ def : WriteRes; def : WriteRes; +def : WriteRes; +def : WriteRes { let Latency = 6; } +def : WriteRes; + defm : SBWriteResPair; defm : SBWriteResPair; def : WriteRes { let Latency = 3; } Index: lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- lib/Target/X86/X86SchedSkylakeClient.td +++ lib/Target/X86/X86SchedSkylakeClient.td @@ -117,6 +117,11 @@ def : WriteRes; def : WriteRes; +// Vector loads, stores, and moves, not folded with other operations. +def : WriteRes { let Latency = 6; } +def : WriteRes; +def : WriteRes; + // Idioms that clear a register, like xorps %xmm0, %xmm0. // These can often bypass execution ports completely. def : WriteRes; Index: lib/Target/X86/X86SchedSkylakeServer.td =================================================================== --- lib/Target/X86/X86SchedSkylakeServer.td +++ lib/Target/X86/X86SchedSkylakeServer.td @@ -117,6 +117,10 @@ def : WriteRes; def : WriteRes; +def : WriteRes { let Latency = 5; } +def : WriteRes; +def : WriteRes; + // Idioms that clear a register, like xorps %xmm0, %xmm0. // These can often bypass execution ports completely. def : WriteRes; Index: lib/Target/X86/X86Schedule.td =================================================================== --- lib/Target/X86/X86Schedule.td +++ lib/Target/X86/X86Schedule.td @@ -54,6 +54,11 @@ def WriteStore : SchedWrite; def WriteMove : SchedWrite; +// Vector integer/float loads, stores, and moves. +def WriteVecLoad : SchedWrite; +def WriteVecStore : SchedWrite; +def WriteVecMove : SchedWrite; + // Idioms that clear a register, like xorps %xmm0, %xmm0. // These can often bypass execution ports completely. def WriteZero : SchedWrite; Index: lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- lib/Target/X86/X86ScheduleBtVer2.td +++ lib/Target/X86/X86ScheduleBtVer2.td @@ -208,13 +208,16 @@ //////////////////////////////////////////////////////////////////////////////// // Loads, stores, and moves, not folded with other operations. -// FIXME: Split x86 and SSE load/store/moves //////////////////////////////////////////////////////////////////////////////// def : WriteRes { let Latency = 5; } def : WriteRes; def : WriteRes; +def : WriteRes { let Latency = 5; } +def : WriteRes; +def : WriteRes; + // Treat misc copies as a move. def : InstRW<[WriteMove], (instrs COPY)>; Index: lib/Target/X86/X86ScheduleSLM.td =================================================================== --- lib/Target/X86/X86ScheduleSLM.td +++ lib/Target/X86/X86ScheduleSLM.td @@ -77,6 +77,10 @@ def : WriteRes; def : WriteRes; +def : WriteRes; +def : WriteRes { let Latency = 3; } +def : WriteRes; + // Treat misc copies as a move. def : InstRW<[WriteMove], (instrs COPY)>; Index: lib/Target/X86/X86ScheduleZnver1.td =================================================================== --- lib/Target/X86/X86ScheduleZnver1.td +++ lib/Target/X86/X86ScheduleZnver1.td @@ -134,6 +134,10 @@ def : WriteRes; def : WriteRes { let Latency = 8; } +def : WriteRes; +def : WriteRes; +def : WriteRes { let Latency = 8; } + def : WriteRes; def : WriteRes; defm : ZnWriteResPair; Index: test/CodeGen/X86/avx-schedule.ll =================================================================== --- test/CodeGen/X86/avx-schedule.ll +++ test/CodeGen/X86/avx-schedule.ll @@ -2103,7 +2103,7 @@ ; ZNVER1: # %bb.0: ; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:0.50] ; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [4:0.50] -; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: retq # sched: [1:0.50] %1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1) call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) @@ -2166,7 +2166,7 @@ ; ZNVER1: # %bb.0: ; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [8:1.00] ; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [5:1.00] -; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.25] ; ZNVER1-NEXT: retq # sched: [1:0.50] %1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1) call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2) @@ -2229,7 +2229,7 @@ ; ZNVER1: # %bb.0: ; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:0.50] ; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [4:0.50] -; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: retq # sched: [1:0.50] %1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1) call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) @@ -2292,7 +2292,7 @@ ; ZNVER1: # %bb.0: ; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [8:1.00] ; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [5:1.00] -; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.25] ; ZNVER1-NEXT: retq # sched: [1:0.50] %1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1) call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2) Index: test/CodeGen/X86/avx2-schedule.ll =================================================================== --- test/CodeGen/X86/avx2-schedule.ll +++ test/CodeGen/X86/avx2-schedule.ll @@ -573,7 +573,7 @@ define <4 x i64> @test_movntdqa(i8* %a0) { ; GENERIC-LABEL: test_movntdqa: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [4:0.50] +; GENERIC-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [6:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movntdqa: @@ -3380,7 +3380,7 @@ define <4 x i32> @test_pmaskmovd(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) { ; GENERIC-LABEL: test_pmaskmovd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [4:0.50] +; GENERIC-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [6:0.50] ; GENERIC-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3429,7 +3429,7 @@ define <8 x i32> @test_pmaskmovd_ymm(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) { ; GENERIC-LABEL: test_pmaskmovd_ymm: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [4:0.50] +; GENERIC-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [6:0.50] ; GENERIC-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3478,7 +3478,7 @@ define <2 x i64> @test_pmaskmovq(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) { ; GENERIC-LABEL: test_pmaskmovq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [4:0.50] +; GENERIC-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [6:0.50] ; GENERIC-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3527,7 +3527,7 @@ define <4 x i64> @test_pmaskmovq_ymm(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) { ; GENERIC-LABEL: test_pmaskmovq_ymm: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [4:0.50] +; GENERIC-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [6:0.50] ; GENERIC-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] Index: test/CodeGen/X86/sha-schedule.ll =================================================================== --- test/CodeGen/X86/sha-schedule.ll +++ test/CodeGen/X86/sha-schedule.ll @@ -210,11 +210,11 @@ ; ; GOLDMONT-LABEL: test_sha256rnds2: ; GOLDMONT: # %bb.0: -; GOLDMONT-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00] -; GOLDMONT-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00] +; GOLDMONT-NEXT: movaps %xmm0, %xmm3 # sched: [1:0.50] +; GOLDMONT-NEXT: movaps %xmm2, %xmm0 # sched: [1:0.50] ; GOLDMONT-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [4:1.00] ; GOLDMONT-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [7:1.00] -; GOLDMONT-NEXT: movaps %xmm3, %xmm0 # sched: [1:1.00] +; GOLDMONT-NEXT: movaps %xmm3, %xmm0 # sched: [1:0.50] ; GOLDMONT-NEXT: retq # sched: [4:1.00] ; ; CANNONLAKE-LABEL: test_sha256rnds2: @@ -228,11 +228,11 @@ ; ; ZNVER1-LABEL: test_sha256rnds2: ; ZNVER1: # %bb.0: -; ZNVER1-NEXT: vmovaps %xmm0, %xmm3 # sched: [1:0.50] -; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vmovaps %xmm0, %xmm3 # sched: [1:0.25] +; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [4:1.00] ; ZNVER1-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [11:1.00] -; ZNVER1-NEXT: vmovaps %xmm3, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vmovaps %xmm3, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: retq # sched: [1:0.50] %1 = load <4 x i32>, <4 x i32>* %a3 %2 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) Index: test/CodeGen/X86/sse-schedule.ll =================================================================== --- test/CodeGen/X86/sse-schedule.ll +++ test/CodeGen/X86/sse-schedule.ll @@ -2557,7 +2557,7 @@ ; SLM-NEXT: rcpps (%rdi), %xmm1 # sched: [8:1.00] ; SLM-NEXT: rcpps %xmm0, %xmm0 # sched: [5:1.00] ; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] -; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_rcpps: @@ -2729,7 +2729,7 @@ ; SLM-NEXT: rsqrtps (%rdi), %xmm1 # sched: [8:1.00] ; SLM-NEXT: rsqrtps %xmm0, %xmm0 # sched: [5:1.00] ; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] -; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_rsqrtps: @@ -3038,7 +3038,7 @@ ; SLM-NEXT: sqrtps (%rdi), %xmm1 # sched: [18:1.00] ; SLM-NEXT: sqrtps %xmm0, %xmm0 # sched: [15:1.00] ; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] -; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_sqrtps: Index: test/CodeGen/X86/sse2-schedule.ll =================================================================== --- test/CodeGen/X86/sse2-schedule.ll +++ test/CodeGen/X86/sse2-schedule.ll @@ -3564,7 +3564,7 @@ ; SLM-LABEL: test_movsd_reg: ; SLM: # %bb.0: ; SLM-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:1.00] -; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_movsd_reg: @@ -8756,7 +8756,7 @@ ; SLM-NEXT: sqrtpd (%rdi), %xmm1 # sched: [18:1.00] ; SLM-NEXT: sqrtpd %xmm0, %xmm0 # sched: [15:1.00] ; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] -; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_sqrtpd: @@ -9284,10 +9284,10 @@ ; SLM-LABEL: test_unpcklpd: ; SLM: # %bb.0: ; SLM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; SLM-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00] +; SLM-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.50] ; SLM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00] ; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] -; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_unpcklpd: Index: test/CodeGen/X86/sse3-schedule.ll =================================================================== --- test/CodeGen/X86/sse3-schedule.ll +++ test/CodeGen/X86/sse3-schedule.ll @@ -566,7 +566,7 @@ ; SLM-NEXT: movddup {{.*#+}} xmm1 = mem[0,0] sched: [4:1.00] ; SLM-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00] ; SLM-NEXT: subpd %xmm0, %xmm1 # sched: [3:1.00] -; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_movddup: @@ -645,7 +645,7 @@ ; SLM-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:1.00] ; SLM-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00] ; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] -; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_movshdup: @@ -724,7 +724,7 @@ ; SLM-NEXT: movsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:1.00] ; SLM-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00] ; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] -; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_movsldup: Index: test/CodeGen/X86/sse41-schedule.ll =================================================================== --- test/CodeGen/X86/sse41-schedule.ll +++ test/CodeGen/X86/sse41-schedule.ll @@ -163,11 +163,11 @@ ; ; SLM-LABEL: test_blendvpd: ; SLM: # %bb.0: -; SLM-NEXT: movapd %xmm0, %xmm3 # sched: [1:1.00] -; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movapd %xmm0, %xmm3 # sched: [1:0.50] +; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:0.50] ; SLM-NEXT: blendvpd %xmm0, %xmm1, %xmm3 # sched: [1:1.00] ; SLM-NEXT: blendvpd %xmm0, (%rdi), %xmm3 # sched: [4:1.00] -; SLM-NEXT: movapd %xmm3, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movapd %xmm3, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_blendvpd: @@ -230,11 +230,11 @@ ; ; SLM-LABEL: test_blendvps: ; SLM: # %bb.0: -; SLM-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00] -; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm0, %xmm3 # sched: [1:0.50] +; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:0.50] ; SLM-NEXT: blendvps %xmm0, %xmm1, %xmm3 # sched: [1:1.00] ; SLM-NEXT: blendvps %xmm0, (%rdi), %xmm3 # sched: [4:1.00] -; SLM-NEXT: movaps %xmm3, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm3, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_blendvps: @@ -717,7 +717,7 @@ ; SLM-LABEL: test_pblendvb: ; SLM: # %bb.0: ; SLM-NEXT: movdqa %xmm0, %xmm3 # sched: [1:0.50] -; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:0.50] ; SLM-NEXT: pblendvb %xmm0, %xmm1, %xmm3 # sched: [1:1.00] ; SLM-NEXT: pblendvb %xmm0, (%rdi), %xmm3 # sched: [4:1.00] ; SLM-NEXT: movdqa %xmm3, %xmm0 # sched: [1:0.50] @@ -2991,7 +2991,7 @@ ; SLM-NEXT: roundpd $7, (%rdi), %xmm1 # sched: [6:1.00] ; SLM-NEXT: roundpd $7, %xmm0, %xmm0 # sched: [3:1.00] ; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] -; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_roundpd: @@ -3063,7 +3063,7 @@ ; SLM-NEXT: roundps $7, (%rdi), %xmm1 # sched: [6:1.00] ; SLM-NEXT: roundps $7, %xmm0, %xmm0 # sched: [3:1.00] ; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00] -; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_roundps: @@ -3133,7 +3133,7 @@ ; ; SLM-LABEL: test_roundsd: ; SLM: # %bb.0: -; SLM-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00] +; SLM-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.50] ; SLM-NEXT: roundsd $7, (%rdi), %xmm0 # sched: [6:1.00] ; SLM-NEXT: roundsd $7, %xmm1, %xmm2 # sched: [3:1.00] ; SLM-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00] @@ -3206,7 +3206,7 @@ ; ; SLM-LABEL: test_roundss: ; SLM: # %bb.0: -; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:1.00] +; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:0.50] ; SLM-NEXT: roundss $7, (%rdi), %xmm0 # sched: [6:1.00] ; SLM-NEXT: roundss $7, %xmm1, %xmm2 # sched: [3:1.00] ; SLM-NEXT: addps %xmm2, %xmm0 # sched: [3:1.00]