Index: llvm/trunk/lib/Target/X86/X86.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86.td
+++ llvm/trunk/lib/Target/X86/X86.td
@@ -814,10 +814,8 @@
   FeatureMWAITX
 ]>;
 
-// TODO: The scheduler model falls to BTVER2 model.
-// The znver1 model has to be put in place.
-// Zen
-def: ProcessorModel<"znver1", BtVer2Model, [
+// Znver1
+def: ProcessorModel<"znver1", Znver1Model, [
   FeatureADX,
   FeatureAES,
   FeatureAVX2,
Index: llvm/trunk/lib/Target/X86/X86Schedule.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86Schedule.td
+++ llvm/trunk/lib/Target/X86/X86Schedule.td
@@ -663,5 +663,6 @@
 include "X86SchedSandyBridge.td"
 include "X86SchedHaswell.td"
 include "X86ScheduleSLM.td"
+include "X86ScheduleZnver1.td"
 include "X86ScheduleBtVer2.td"
 
Index: llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td
+++ llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td
@@ -0,0 +1,223 @@
+//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Znver1 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Znver1Model : SchedMachineModel {
+  // Zen can decode 4 instructions per cycle.
+  let IssueWidth = 4;
+  // MicroOpBufferSize is based on the size of the reorder buffer.
+  let MicroOpBufferSize = 192;
+  let LoadLatency = 4;
+  let MispredictPenalty = 17;
+  let HighLatency = 25;
+  let PostRAScheduler = 1;
+
+  // FIXME: This flag marks the model as incomplete.
+  // Not all instructions have scheduling information yet,
+  // so CompleteModel is cleared to indicate that the model
+  // does not yet cover every instruction.
+  let CompleteModel = 0;
+}
+
+let SchedModel = Znver1Model in {
+
+// Zen can issue micro-ops to 10 different units in one cycle.
+// These are:
+// * Four integer ALU units (ZnALU0, ZnALU1, ZnALU2, ZnALU3)
+// * Two AGU units (ZnAGU0, ZnAGU1)
+// * Four FPU units (ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3)
+// The AGUs feed the load/store queues at two loads and one store per cycle.
+
+// Four ALU units are defined below.
+def ZnALU0 : ProcResource<1>;
+def ZnALU1 : ProcResource<1>;
+def ZnALU2 : ProcResource<1>;
+def ZnALU3 : ProcResource<1>;
+
+// Two AGU units are defined below.
+def ZnAGU0 : ProcResource<1>;
+def ZnAGU1 : ProcResource<1>;
+
+// Four FPU units are defined below.
+def ZnFPU0 : ProcResource<1>;
+def ZnFPU1 : ProcResource<1>;
+def ZnFPU2 : ProcResource<1>;
+def ZnFPU3 : ProcResource<1>;
+
+// FPU groupings.
+def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>;
+def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>;
+def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>;
+def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>;
+def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>;
+def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>;
+def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>;
+def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>;
+
+// Below are the groupings of the units.
+// Micro-ops that may issue to more than one unit are modeled with these groups.
+
+// ALU grouping
+// ZnALU03 - 0,3 grouping
+def ZnALU03: ProcResGroup<[ZnALU0, ZnALU3]>;
+
+// 56-entry (14x4 entries) Int Scheduler.
+def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> {
+  let BufferSize=56;
+}
+
+// 28-entry (14x2) AGU group. AGUs can't be used for all ALU operations,
+// but are relevant for some instructions.
+def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> {
+  let BufferSize=28;
+}
+
+// Integer multiplication is issued on ALU1.
+def ZnMultiplier : ProcResource<1>;
+
+// Integer division is issued on ALU2.
+def ZnDivider : ProcResource<1>;
+
+// The 4-cycle load-to-use latency is captured here.
+def : ReadAdvance;
+
+// A folded load is an instruction that loads and then operates on the value,
+// e.g. ADDPD xmm, [mem]. Such instructions are usually micro-fused, so they
+// only appear as two micro-ops:
+//  a. the load, and
+//  b. the addpd.
+// This multiclass is for folded loads on the integer units.
+multiclass ZnWriteResPair {
+  // Register variant takes 1 cycle on the execution port.
+  def : WriteRes { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on ZnAGU
+  // and adds 4 cycles to the latency.
+  def : WriteRes {
+    let Latency = !add(Lat, 4);
+  }
+}
+
+// This multiclass is for folded loads on the floating-point units.
+multiclass ZnWriteResFpuPair {
+  // Register variant takes 1 cycle on the execution port.
+  def : WriteRes { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on ZnAGU
+  // and adds 7 cycles to the latency.
+  def : WriteRes {
+    let Latency = !add(Lat, 7);
+  }
+}
+
+// WriteRMW is set for instructions with a memory-write
+// operation in codegen.
+def : WriteRes;
+
+def : WriteRes;
+def : WriteRes;
+def : WriteRes { let Latency = 8; }
+
+def : WriteRes;
+def : WriteRes;
+defm : ZnWriteResPair;
+defm : ZnWriteResPair;
+defm : ZnWriteResPair;
+
+// IDIV
+def : WriteRes {
+  let Latency = 41;
+  let ResourceCycles = [1, 41];
+}
+
+def : WriteRes {
+  let Latency = 45;
+  let ResourceCycles = [1, 4, 41];
+}
+
+// IMUL
+def : WriteRes{
+  let Latency = 4;
+}
+def : WriteRes {
+  let Latency = 4;
+}
+
+def : WriteRes {
+  let Latency = 8;
+}
+
+// Floating point operations
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+
+// Vector integer operations, which use the FPU units.
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+
+// Vector shift operations.
+defm : ZnWriteResFpuPair;
+
+// AES instructions.
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+defm : ZnWriteResFpuPair;
+
+def : WriteRes;
+def : WriteRes;
+
+// The following instructions are microcoded (latency = 100).
+// The long latency is set so as to block the entire pipeline.
+defm : ZnWriteResFpuPair; + +//Microcoded Instructions +let Latency = 100 in { + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + } +} Index: llvm/trunk/test/CodeGen/X86/avx-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-schedule.ll +++ llvm/trunk/test/CodeGen/X86/avx-schedule.ll @@ -27,9 +27,9 @@ ; ; ZNVER1-LABEL: test_addpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fadd <4 x double> %1, %2 @@ -57,9 +57,9 @@ ; ; ZNVER1-LABEL: test_addps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fadd <8 x float> %1, %2 @@ -87,9 +87,9 @@ ; ; ZNVER1-LABEL: test_addsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %1, <4 x double> %2) @@ -118,9 +118,9 @@ ; ; ZNVER1-LABEL: test_addsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %1, <8 x float> %2) @@ -152,10 +152,10 @@ ; ; ZNVER1-LABEL: test_andnotpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = xor <4 x i64> %1, @@ -193,10 +193,10 @@ ; ; ZNVER1-LABEL: test_andnotps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandnps %ymm1, 
%ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = xor <4 x i64> %1, @@ -234,10 +234,10 @@ ; ; ZNVER1-LABEL: test_andpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = and <4 x i64> %1, %2 @@ -273,10 +273,10 @@ ; ; ZNVER1-LABEL: test_andps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = and <4 x i64> %1, %2 @@ -313,9 +313,9 @@ ; ZNVER1-LABEL: test_blendpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fadd <4 x double> %a1, %1 @@ -345,8 +345,8 @@ ; ZNVER1-LABEL: test_blendps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] -; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -374,9 +374,9 @@ ; ; ZNVER1-LABEL: test_blendvpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> 
%a0, <4 x double> %a1, <4 x double> %a2) %2 = load <4 x double>, <4 x double> *%a3, align 32 %3 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %1, <4 x double> %2, <4 x double> %a2) @@ -405,9 +405,9 @@ ; ; ZNVER1-LABEL: test_blendvps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) %2 = load <8 x float>, <8 x float> *%a3, align 32 %3 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %1, <8 x float> %2, <8 x float> %a2) @@ -433,8 +433,8 @@ ; ; ZNVER1-LABEL: test_broadcastf128: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 32 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> ret <8 x float> %2 @@ -458,8 +458,8 @@ ; ; ZNVER1-LABEL: test_broadcastsd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load double, double *%a0, align 8 %2 = insertelement <4 x double> undef, double %1, i32 0 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer @@ -484,8 +484,8 @@ ; ; ZNVER1-LABEL: test_broadcastss: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float *%a0, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer @@ -510,8 +510,8 @@ ; ; ZNVER1-LABEL: test_broadcastss_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float *%a0, align 4 %2 = insertelement <8 x float> undef, float %1, i32 0 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> zeroinitializer @@ -543,9 +543,9 @@ ; ZNVER1-LABEL: test_cmppd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fcmp oeq <4 x double> %a0, %2 @@ -581,9 +581,9 @@ ; ZNVER1-LABEL: test_cmpps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: 
vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fcmp oeq <8 x float> %a0, %2 @@ -618,10 +618,10 @@ ; ; ZNVER1-LABEL: test_cvtdq2pd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <4 x i32> %a0 to <4 x double> %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = sitofp <4 x i32> %2 to <4 x double> @@ -655,10 +655,10 @@ ; ; ZNVER1-LABEL: test_cvtdq2ps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <8 x i32> %a0 to <8 x float> %2 = load <8 x i32>, <8 x i32> *%a1, align 16 %3 = sitofp <8 x i32> %2 to <8 x float> @@ -690,10 +690,10 @@ ; ; ZNVER1-LABEL: test_cvtpd2dq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [5:1.00] ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <4 x double> %a0 to <4 x i32> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = fptosi <4 x double> %2 to <4 x i32> @@ -725,10 +725,10 @@ ; ; ZNVER1-LABEL: test_cvtpd2ps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00] ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptrunc <4 x double> %a0 to <4 x float> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = fptrunc <4 x double> %2 to <4 x float> @@ -760,10 +760,10 @@ ; ; ZNVER1-LABEL: test_cvtps2dq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <8 x float> %a0 to <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = fptosi <8 x float> %2 to <8 x i32> @@ -792,9 +792,9 @@ ; ; ZNVER1-LABEL: test_divpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00] -; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00] -; 
ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fdiv <4 x double> %1, %2 @@ -822,9 +822,9 @@ ; ; ZNVER1-LABEL: test_divps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00] -; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fdiv <8 x float> %1, %2 @@ -853,8 +853,8 @@ ; ZNVER1-LABEL: test_dpps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %1, <8 x float> %2, i8 7) @@ -886,9 +886,9 @@ ; ZNVER1-LABEL: test_extractf128: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] +; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:0.50] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <4 x i32> %2 = shufflevector <8 x float> %a1, <8 x float> undef, <4 x i32> store <4 x float> %2, <4 x float> *%a2 @@ -916,9 +916,9 @@ ; ; ZNVER1-LABEL: test_haddpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %1, <4 x double> %2) @@ -947,9 +947,9 @@ ; ; ZNVER1-LABEL: test_haddps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %1, <8 x float> %2) @@ -978,9 +978,9 @@ ; ; ZNVER1-LABEL: test_hsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # 
sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %1, <4 x double> %2) @@ -1009,9 +1009,9 @@ ; ; ZNVER1-LABEL: test_hsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %2) @@ -1044,9 +1044,9 @@ ; ZNVER1-LABEL: test_insertf128: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50] -; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> %2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> %3 = load <4 x float>, <4 x float> *%a2, align 16 @@ -1074,8 +1074,8 @@ ; ; ZNVER1-LABEL: test_lddqu: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) ret <32 x i8> %1 } @@ -1108,7 +1108,7 @@ ; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1) call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) ret <2 x double> %1 @@ -1143,7 +1143,7 @@ ; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1) call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2) ret <4 x double> %1 @@ -1178,7 +1178,7 @@ ; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1) call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) ret <4 x float> %1 @@ -1213,7 +1213,7 @@ ; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: 
[5:0.50] %1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1) call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2) ret <8 x float> %1 @@ -1243,8 +1243,8 @@ ; ZNVER1-LABEL: test_maxpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %1, <4 x double> %2) @@ -1274,8 +1274,8 @@ ; ZNVER1-LABEL: test_maxps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2) @@ -1305,8 +1305,8 @@ ; ZNVER1-LABEL: test_minpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %1, <4 x double> %2) @@ -1336,8 +1336,8 @@ ; ZNVER1-LABEL: test_minps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2) @@ -1369,10 +1369,10 @@ ; ; ZNVER1-LABEL: test_movapd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x double>, <4 x double> *%a0, align 32 %2 = fadd <4 x double> %1, %1 store <4 x double> %2, <4 x double> *%a1, align 32 @@ -1403,10 +1403,10 @@ ; ; ZNVER1-LABEL: test_movaps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x float>, <8 x float> *%a0, align 32 %2 = fadd 
<8 x float> %1, %1 store <8 x float> %2, <8 x float> *%a1, align 32 @@ -1437,10 +1437,10 @@ ; ; ZNVER1-LABEL: test_movddup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00] +; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [8:0.50] ; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> @@ -1468,9 +1468,9 @@ ; ; ZNVER1-LABEL: test_movmskpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ret i32 %1 } @@ -1496,9 +1496,9 @@ ; ; ZNVER1-LABEL: test_movmskps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ret i32 %1 } @@ -1525,9 +1525,9 @@ ; ; ZNVER1-LABEL: test_movntpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x double> %a0, %a0 store <4 x double> %1, <4 x double> *%a1, align 32, !nontemporal !0 ret <4 x double> %1 @@ -1554,9 +1554,9 @@ ; ; ZNVER1-LABEL: test_movntps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <8 x float> %a0, %a0 store <8 x float> %1, <8 x float> *%a1, align 32, !nontemporal !0 ret <8 x float> %1 @@ -1586,10 +1586,10 @@ ; ; ZNVER1-LABEL: test_movshdup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00] +; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [8:0.50] ; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -1621,10 +1621,10 @@ ; ; ZNVER1-LABEL: test_movsldup: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00] +; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [8:0.50] ; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, 
%ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -1658,10 +1658,10 @@ ; ; ZNVER1-LABEL: test_movupd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x double>, <4 x double> *%a0, align 1 %2 = fadd <4 x double> %1, %1 store <4 x double> %2, <4 x double> *%a1, align 1 @@ -1694,10 +1694,10 @@ ; ; ZNVER1-LABEL: test_movups: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x float>, <8 x float> *%a0, align 1 %2 = fadd <8 x float> %1, %1 store <8 x float> %2, <8 x float> *%a1, align 1 @@ -1725,9 +1725,9 @@ ; ; ZNVER1-LABEL: test_mulpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00] -; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fmul <4 x double> %1, %2 @@ -1755,9 +1755,9 @@ ; ; ZNVER1-LABEL: test_mulps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fmul <8 x float> %1, %2 @@ -1788,10 +1788,10 @@ ; ; ZNVER1-LABEL: orpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = or <4 x i64> %1, %2 @@ -1827,10 +1827,10 @@ ; ; ZNVER1-LABEL: test_orps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vorps %ymm1, 
%ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = or <4 x i64> %1, %2 @@ -1866,10 +1866,10 @@ ; ; ZNVER1-LABEL: test_permilpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [8:0.50] ; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50] ; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> @@ -1901,10 +1901,10 @@ ; ; ZNVER1-LABEL: test_permilpd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00] +; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:0.50] ; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> @@ -1936,10 +1936,10 @@ ; ; ZNVER1-LABEL: test_permilps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50] ; ZNVER1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50] ; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> @@ -1971,10 +1971,10 @@ ; ; ZNVER1-LABEL: test_permilps_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00] +; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:0.50] ; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> @@ -2004,8 +2004,8 @@ ; ZNVER1-LABEL: test_permilvarpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> %2) @@ -2035,8 +2035,8 @@ ; ZNVER1-LABEL: test_permilvarpd_ymm: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilpd %ymm1, %ymm0, 
%ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> %2) @@ -2066,8 +2066,8 @@ ; ZNVER1-LABEL: test_permilvarps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> %2) @@ -2097,8 +2097,8 @@ ; ZNVER1-LABEL: test_permilvarps_ymm: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> %2) @@ -2130,10 +2130,10 @@ ; ; ZNVER1-LABEL: test_rcpps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00] -; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %2) @@ -2166,10 +2166,10 @@ ; ; ZNVER1-LABEL: test_roundpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:1.00] ; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %2, i32 7) @@ -2202,10 +2202,10 @@ ; ; ZNVER1-LABEL: test_roundps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00] +; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:1.00] ; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %2, i32 7) @@ 
-2238,10 +2238,10 @@ ; ; ZNVER1-LABEL: test_rsqrtps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00] -; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %2) @@ -2275,9 +2275,9 @@ ; ZNVER1-LABEL: test_shufpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50] -; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2307,8 +2307,8 @@ ; ZNVER1-LABEL: test_shufps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50] -; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -2339,10 +2339,10 @@ ; ; ZNVER1-LABEL: test_sqrtpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00] -; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) %2 = load <4 x double>, <4 x double> *%a1, align 32 %3 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %2) @@ -2375,10 +2375,10 @@ ; ; ZNVER1-LABEL: test_sqrtps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00] -; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) %2 = load <8 x float>, <8 x float> *%a1, align 32 %3 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %2) @@ -2408,9 +2408,9 @@ ; ; ZNVER1-LABEL: test_subpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; 
ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = fsub <4 x double> %1, %2 @@ -2438,9 +2438,9 @@ ; ; ZNVER1-LABEL: test_subps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = fsub <8 x float> %1, %2 @@ -2477,12 +2477,12 @@ ; ; ZNVER1-LABEL: test_testpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %2) @@ -2523,13 +2523,13 @@ ; ; ZNVER1-LABEL: test_testpd_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %2) @@ -2568,12 +2568,12 @@ ; ; ZNVER1-LABEL: test_testps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.50] -; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %2) @@ -2614,13 +2614,13 @@ ; ; ZNVER1-LABEL: test_testps_ymm: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: xorl %eax, %eax # 
sched: [1:0.50] -; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: setb %al # sched: [1:0.50] -; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.50] +; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %2) @@ -2654,9 +2654,9 @@ ; ZNVER1-LABEL: test_unpckhpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50] -; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2686,8 +2686,8 @@ ; ZNVER1-LABEL: test_unpckhps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50] -; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> @@ -2719,9 +2719,9 @@ ; ZNVER1-LABEL: test_unpcklpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50] -; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> %2 = load <4 x double>, <4 x double> *%a2, align 32 %3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> @@ -2751,8 +2751,8 @@ ; ZNVER1-LABEL: test_unpcklps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50] -; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> %2 = load <8 x float>, <8 x float> *%a2, align 32 %3 = shufflevector 
<8 x float> %1, <8 x float> %2, <8 x i32> @@ -2783,10 +2783,10 @@ ; ; ZNVER1-LABEL: test_xorpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> %3 = xor <4 x i64> %1, %2 @@ -2822,10 +2822,10 @@ ; ; ZNVER1-LABEL: test_xorps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> %3 = xor <4 x i64> %1, %2 @@ -2856,7 +2856,7 @@ ; ZNVER1-LABEL: test_zeroall: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vzeroall # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.avx.vzeroall() ret void } @@ -2881,7 +2881,7 @@ ; ZNVER1-LABEL: test_zeroupper: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.avx.vzeroupper() ret void } Index: llvm/trunk/test/CodeGen/X86/avx2-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx2-schedule.ll +++ llvm/trunk/test/CodeGen/X86/avx2-schedule.ll @@ -13,10 +13,10 @@ ; ; ZNVER1-LABEL: test_pabsb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) %2 = load <32 x i8>, <32 x i8> *%a1, align 32 %3 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %2) @@ -35,10 +35,10 @@ ; ; ZNVER1-LABEL: test_pabsd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) %2 = load <8 x i32>, <8 x i32> *%a1, align 32 %3 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %2) @@ -57,10 +57,10 @@ ; ; ZNVER1-LABEL: test_pabsw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # 
sched: [4:1.00] +; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) %2 = load <16 x i16>, <16 x i16> *%a1, align 32 %3 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %2) @@ -78,9 +78,9 @@ ; ; ZNVER1-LABEL: test_paddb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <32 x i8> %a0, %a1 %2 = load <32 x i8>, <32 x i8> *%a2, align 32 %3 = add <32 x i8> %1, %2 @@ -96,9 +96,9 @@ ; ; ZNVER1-LABEL: test_paddd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = add <8 x i32> %1, %2 @@ -114,9 +114,9 @@ ; ; ZNVER1-LABEL: test_paddq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = add <4 x i64> %1, %2 @@ -132,9 +132,9 @@ ; ; ZNVER1-LABEL: test_paddw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = add <16 x i16> %1, %2 @@ -151,10 +151,10 @@ ; ; ZNVER1-LABEL: test_pand: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = and <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = and <4 x i64> %1, %2 @@ -172,10 +172,10 @@ ; ; ZNVER1-LABEL: test_pandn: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <4 x i64> %a0, %2 = and <4 x i64> %a1, %1 %3 = load <4 x i64>, 
<4 x i64> *%a2, align 32 @@ -194,9 +194,9 @@ ; ; ZNVER1-LABEL: test_pmulld: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = mul <8 x i32> %1, %2 @@ -212,9 +212,9 @@ ; ; ZNVER1-LABEL: test_pmullw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x i16> *%a2, align 32 %3 = mul <16 x i16> %1, %2 @@ -231,10 +231,10 @@ ; ; ZNVER1-LABEL: test_por: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = or <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = or <4 x i64> %1, %2 @@ -251,9 +251,9 @@ ; ; ZNVER1-LABEL: test_psubb: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <32 x i8> %a0, %a1 %2 = load <32 x i8>, <32 x i8> *%a2, align 32 %3 = sub <32 x i8> %1, %2 @@ -269,9 +269,9 @@ ; ; ZNVER1-LABEL: test_psubd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <8 x i32> %a0, %a1 %2 = load <8 x i32>, <8 x i32> *%a2, align 32 %3 = sub <8 x i32> %1, %2 @@ -287,9 +287,9 @@ ; ; ZNVER1-LABEL: test_psubq: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = sub <4 x i64> %1, %2 @@ -305,9 +305,9 @@ ; ; ZNVER1-LABEL: test_psubw: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <16 x i16> %a0, %a1 %2 = load <16 x i16>, <16 x 
i16> *%a2, align 32 %3 = sub <16 x i16> %1, %2 @@ -324,10 +324,10 @@ ; ; ZNVER1-LABEL: test_pxor: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <4 x i64> %a0, %a1 %2 = load <4 x i64>, <4 x i64> *%a2, align 32 %3 = xor <4 x i64> %1, %2 Index: llvm/trunk/test/CodeGen/X86/bmi-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/bmi-schedule.ll +++ llvm/trunk/test/CodeGen/X86/bmi-schedule.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define i16 @test_andn_i16(i16 zeroext %a0, i16 zeroext %a1, i16 *%a2) { ; GENERIC-LABEL: test_andn_i16: @@ -33,6 +33,15 @@ ; BTVER2-NEXT: addl %edi, %eax # sched: [1:0.50] ; BTVER2-NEXT: # kill: %AX %AX %EAX ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnl %esi, %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: notl %edi # sched: [1:0.25] +; ZNVER1-NEXT: andw (%rdx), %di # sched: [5:0.50] +; ZNVER1-NEXT: addl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i16, i16 *%a2 %2 = xor i16 %a0, -1 %3 = and i16 %2, %a1 @@ -62,6 +71,13 @@ ; BTVER2-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnl (%rdx), %edi, %eax # sched: [5:0.50] +; ZNVER1-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.25] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i32, i32 *%a2 %2 = xor i32 %a0, -1 %3 = and i32 %2, %a1 @@ -91,6 +107,13 @@ ; BTVER2-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andn_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: andnq (%rdx), %rdi, %rax # sched: [5:0.50] +; ZNVER1-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.25] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64 *%a2 %2 = xor i64 %a0, -1 %3 = and i64 %2, %a1 @@ -120,6 +143,13 @@ ; BTVER2-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bextr_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00] +; 
ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i32, i32 *%a2 %2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %1, i32 %a0) %3 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a1, i32 %a0) @@ -149,6 +179,13 @@ ; BTVER2-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_bextr_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64 *%a2 %2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %1, i64 %a0) %3 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a1, i64 %a0) @@ -178,6 +215,13 @@ ; BTVER2-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsi_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i32, i32 *%a1 %2 = sub i32 0, %1 %3 = sub i32 0, %a0 @@ -208,6 +252,13 @@ ; BTVER2-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsi_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsiq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64 *%a1 %2 = sub i64 0, %1 %3 = sub i64 0, %a0 @@ -238,6 +289,13 @@ ; BTVER2-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsmsk_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsmskl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i32, i32 *%a1 %2 = sub i32 %1, 1 %3 = sub i32 %a0, 1 @@ -268,6 +326,13 @@ ; BTVER2-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsmsk_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64 *%a1 %2 = sub i64 %1, 1 %3 = sub i64 %a0, 1 @@ -298,6 +363,13 @@ ; BTVER2-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsr_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i32, i32 *%a1 %2 = sub i32 %1, 1 %3 = sub i32 %a0, 1 @@ -328,6 +400,13 @@ ; BTVER2-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blsr_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: blsrq (%rsi), %rcx 
# sched: [?:0.000000e+00] +; ZNVER1-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64 *%a1 %2 = sub i64 %1, 1 %3 = sub i64 %a0, 1 @@ -361,6 +440,14 @@ ; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: # kill: %AX %AX %EAX ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i16, i16 *%a1 %2 = tail call i16 @llvm.cttz.i16( i16 %1, i1 false ) %3 = tail call i16 @llvm.cttz.i16( i16 %a0, i1 false ) @@ -390,6 +477,13 @@ ; BTVER2-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00] ; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i32, i32 *%a1 %2 = tail call i32 @llvm.cttz.i32( i32 %1, i1 false ) %3 = tail call i32 @llvm.cttz.i32( i32 %a0, i1 false ) @@ -419,6 +513,13 @@ ; BTVER2-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00] ; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cttz_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00] +; ZNVER1-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00] +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64 *%a1 %2 = tail call i64 @llvm.cttz.i64( i64 %1, i1 false ) %3 = tail call i64 @llvm.cttz.i64( i64 %a0, i1 false ) Index: llvm/trunk/test/CodeGen/X86/bmi2-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/bmi2-schedule.ll +++ llvm/trunk/test/CodeGen/X86/bmi2-schedule.ll @@ -24,8 +24,8 @@ ; ZNVER1: # BB#0: ; ZNVER1-NEXT: bzhil %edi, (%rdx), %ecx # sched: [?:0.000000e+00] ; ZNVER1-NEXT: bzhil %edi, %esi, %eax # sched: [?:0.000000e+00] -; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i32, i32 *%a2 %2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %1, i32 %a0) %3 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a1, i32 %a0) @@ -53,8 +53,8 @@ ; ZNVER1: # BB#0: ; ZNVER1-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00] ; ZNVER1-NEXT: bzhiq %rdi, %rsi, %rax # sched: [?:0.000000e+00] -; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64 *%a2 %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %1, i64 %a0) %3 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a1, i64 %a0) @@ -82,8 +82,8 @@ ; ZNVER1: # BB#0: ; ZNVER1-NEXT: pdepl (%rdx), %edi, %ecx # sched: [?:0.000000e+00] ; ZNVER1-NEXT: pdepl %esi, %edi, %eax # sched: [?:0.000000e+00] -; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i32, i32 *%a2 %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %1) %3 
= tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1) @@ -111,8 +111,8 @@ ; ZNVER1: # BB#0: ; ZNVER1-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00] ; ZNVER1-NEXT: pdepq %rsi, %rdi, %rax # sched: [?:0.000000e+00] -; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64 *%a2 %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %1) %3 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1) @@ -140,8 +140,8 @@ ; ZNVER1: # BB#0: ; ZNVER1-NEXT: pextl (%rdx), %edi, %ecx # sched: [?:0.000000e+00] ; ZNVER1-NEXT: pextl %esi, %edi, %eax # sched: [?:0.000000e+00] -; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i32, i32 *%a2 %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %1) %3 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1) @@ -169,8 +169,8 @@ ; ZNVER1: # BB#0: ; ZNVER1-NEXT: pextq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00] ; ZNVER1-NEXT: pextq %rsi, %rdi, %rax # sched: [?:0.000000e+00] -; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.50] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64 *%a2 %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %1) %3 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1) Index: llvm/trunk/test/CodeGen/X86/f16c-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/f16c-schedule.ll +++ llvm/trunk/test/CodeGen/X86/f16c-schedule.ll @@ -29,10 +29,10 @@ ; ; ZNVER1-LABEL: test_vcvtph2ps_128: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [5:1.00] ; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x i16>, <8 x i16> *%a1 %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1) %3 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) @@ -65,10 +65,10 @@ ; ; ZNVER1-LABEL: test_vcvtph2ps_256: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [8:1.00] -; ZNVER1-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x i16>, <8 x i16> *%a1 %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1) %3 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) @@ -98,9 +98,9 @@ ; ; ZNVER1-LABEL: test_vcvtps2ph_128: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) %2 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a1, i32 0) %3 = shufflevector 
<8 x i16> %2, <8 x i16> undef, <4 x i32> @@ -132,10 +132,10 @@ ; ; ZNVER1-LABEL: test_vcvtps2ph_256: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [12:1.00] ; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] -; ZNVER1-NEXT: retq # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) %2 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a1, i32 0) store <8 x i16> %2, <8 x i16> *%a2 Index: llvm/trunk/test/CodeGen/X86/lea32-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/lea32-schedule.ll +++ llvm/trunk/test/CodeGen/X86/lea32-schedule.ll @@ -8,7 +8,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define i32 @test_lea_offset(i32) { ; GENERIC-LABEL: test_lea_offset: @@ -52,6 +52,12 @@ ; BTVER2-NEXT: # kill: %EDI %EDI %RDI ; BTVER2-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -24(%rdi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %2 = add nsw i32 %0, -24 ret i32 %2 } @@ -98,6 +104,12 @@ ; BTVER2-NEXT: # kill: %EDI %EDI %RDI ; BTVER2-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 1024(%rdi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %2 = add nsw i32 %0, 1024 ret i32 %2 } @@ -151,6 +163,13 @@ ; BTVER2-NEXT: # kill: %EDI %EDI %RDI ; BTVER2-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %3 = add nsw i32 %1, %0 ret i32 %3 } @@ -205,6 +224,13 @@ ; BTVER2-NEXT: # kill: %EDI %EDI %RDI ; BTVER2-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %3 = add i32 %0, 16 %4 = add i32 %3, %1 ret i32 %4 @@ -262,6 +288,13 @@ ; BTVER2-NEXT: # kill: %EDI %EDI %RDI ; BTVER2-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: 
%EDI %EDI %RDI +; ZNVER1-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %3 = add i32 %0, -4096 %4 = add i32 %3, %1 ret i32 %4 @@ -309,6 +342,12 @@ ; BTVER2-NEXT: # kill: %EDI %EDI %RDI ; BTVER2-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %2 = mul nsw i32 %0, 3 ret i32 %2 } @@ -357,6 +396,12 @@ ; BTVER2-NEXT: # kill: %EDI %EDI %RDI ; BTVER2-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %2 = mul nsw i32 %0, 3 %3 = add nsw i32 %2, -32 ret i32 %3 @@ -408,6 +453,12 @@ ; BTVER2-NEXT: # kill: %EDI %EDI %RDI ; BTVER2-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %2 = mul nsw i32 %0, 9 %3 = add nsw i32 %2, 10000 ret i32 %3 @@ -461,6 +512,13 @@ ; BTVER2-NEXT: # kill: %EDI %EDI %RDI ; BTVER2-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %3 = shl i32 %1, 1 %4 = add nsw i32 %3, %0 ret i32 %4 @@ -516,6 +574,13 @@ ; BTVER2-NEXT: # kill: %EDI %EDI %RDI ; BTVER2-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %3 = shl i32 %1, 2 %4 = add i32 %0, 96 %5 = add i32 %4, %3 @@ -574,6 +639,13 @@ ; BTVER2-NEXT: # kill: %EDI %EDI %RDI ; BTVER2-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: # kill: %ESI %ESI %RSI +; ZNVER1-NEXT: # kill: %EDI %EDI %RDI +; ZNVER1-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %3 = shl i32 %1, 3 %4 = add i32 %0, -1200 %5 = add i32 %4, %3 Index: llvm/trunk/test/CodeGen/X86/lea64-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/lea64-schedule.ll +++ llvm/trunk/test/CodeGen/X86/lea64-schedule.ll @@ -8,7 +8,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; 
RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define i64 @test_lea_offset(i64) { ; GENERIC-LABEL: test_lea_offset: @@ -46,6 +46,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -24(%rdi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %2 = add nsw i64 %0, -24 ret i64 %2 } @@ -86,6 +91,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %2 = add nsw i64 %0, 1024 ret i64 %2 } @@ -127,6 +137,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %3 = add nsw i64 %1, %0 ret i64 %3 } @@ -169,6 +184,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %3 = add i64 %0, 16 %4 = add i64 %3, %1 ret i64 %4 @@ -214,6 +234,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %3 = add i64 %0, -4096 %4 = add i64 %3, %1 ret i64 %4 @@ -255,6 +280,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %2 = mul nsw i64 %0, 3 ret i64 %2 } @@ -297,6 +327,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %2 = mul nsw i64 %0, 3 %3 = add nsw i64 %2, -32 ret i64 %3 @@ -342,6 +377,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_mul_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %2 = mul nsw i64 %0, 9 %3 = add nsw i64 %2, 10000 ret i64 %3 @@ -383,6 +423,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %3 = shl i64 %1, 1 %4 = add nsw i64 %3, %0 ret i64 %4 @@ -426,6 +471,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: 
[5:0.50] %3 = shl i64 %1, 2 %4 = add i64 %0, 96 %5 = add i64 %4, %3 @@ -472,6 +522,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lea_add_scale_offset_big: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %3 = shl i64 %1, 3 %4 = add i64 %0, -1200 %5 = add i64 %4, %3 Index: llvm/trunk/test/CodeGen/X86/lzcnt-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/lzcnt-schedule.ll +++ llvm/trunk/test/CodeGen/X86/lzcnt-schedule.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) { ; GENERIC-LABEL: test_ctlz_i16: @@ -30,6 +30,14 @@ ; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: # kill: %AX %AX %EAX ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntw (%rsi), %cx +; ZNVER1-NEXT: lzcntw %di, %ax +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i16, i16 *%a1 %2 = tail call i16 @llvm.ctlz.i16( i16 %1, i1 false ) %3 = tail call i16 @llvm.ctlz.i16( i16 %a0, i1 false ) @@ -59,6 +67,13 @@ ; BTVER2-NEXT: lzcntl %edi, %eax ; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntl (%rsi), %ecx +; ZNVER1-NEXT: lzcntl %edi, %eax +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i32, i32 *%a1 %2 = tail call i32 @llvm.ctlz.i32( i32 %1, i1 false ) %3 = tail call i32 @llvm.ctlz.i32( i32 %a0, i1 false ) @@ -88,6 +103,13 @@ ; BTVER2-NEXT: lzcntq %rdi, %rax ; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctlz_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lzcntq (%rsi), %rcx +; ZNVER1-NEXT: lzcntq %rdi, %rax +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64 *%a1 %2 = tail call i64 @llvm.ctlz.i64( i64 %1, i1 false ) %3 = tail call i64 @llvm.ctlz.i64( i64 %a0, i1 false ) Index: llvm/trunk/test/CodeGen/X86/popcnt-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/popcnt-schedule.ll +++ llvm/trunk/test/CodeGen/X86/popcnt-schedule.ll @@ -8,7 +8,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK 
--check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) { ; GENERIC-LABEL: test_ctpop_i16: @@ -50,6 +50,14 @@ ; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: # kill: %AX %AX %EAX ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntw (%rsi), %cx # sched: [10:1.00] +; ZNVER1-NEXT: popcntw %di, %ax # sched: [3:1.00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i16, i16 *%a1 %2 = tail call i16 @llvm.ctpop.i16( i16 %1 ) %3 = tail call i16 @llvm.ctpop.i16( i16 %a0 ) @@ -93,6 +101,13 @@ ; BTVER2-NEXT: popcntl %edi, %eax # sched: [3:1.00] ; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntl (%rsi), %ecx # sched: [10:1.00] +; ZNVER1-NEXT: popcntl %edi, %eax # sched: [3:1.00] +; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i32, i32 *%a1 %2 = tail call i32 @llvm.ctpop.i32( i32 %1 ) %3 = tail call i32 @llvm.ctpop.i32( i32 %a0 ) @@ -136,6 +151,13 @@ ; BTVER2-NEXT: popcntq %rdi, %rax # sched: [3:1.00] ; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ctpop_i64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: popcntq (%rsi), %rcx # sched: [10:1.00] +; ZNVER1-NEXT: popcntq %rdi, %rax # sched: [3:1.00] +; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64 *%a1 %2 = tail call i64 @llvm.ctpop.i64( i64 %1 ) %3 = tail call i64 @llvm.ctpop.i64( i64 %a0 ) Index: llvm/trunk/test/CodeGen/X86/sse-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse-schedule.ll +++ llvm/trunk/test/CodeGen/X86/sse-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; GENERIC-LABEL: test_addps: @@ -45,6 +45,12 @@ ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fadd <4 x float> %1, %2 @@ -87,6 +93,12 @@ ; BTVER2-NEXT: vaddss %xmm1, 
%xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fadd float %1, %2 @@ -137,6 +149,12 @@ ; BTVER2-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = and <4 x i32> %1, %2 @@ -191,6 +209,12 @@ ; BTVER2-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = xor <4 x i32> %1, @@ -245,6 +269,13 @@ ; BTVER2-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fcmp oeq <4 x float> %a0, %2 @@ -290,6 +321,12 @@ ; BTVER2-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = insertelement <4 x float> undef, float %a1, i32 0 %3 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %2, i8 0) @@ -385,6 +422,20 @@ ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_comiss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vcomiss (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 4 %3 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %2) @@ -435,6 +486,13 @@ ; 
BTVER2-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2ss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i32 %a0 to float %2 = load i32, i32 *%a1, align 4 %3 = sitofp i32 %2 to float @@ -484,6 +542,13 @@ ; BTVER2-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2ssq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i64 %a0 to float %2 = load i64, i64 *%a1, align 8 %3 = sitofp i64 %2 to float @@ -533,6 +598,13 @@ ; BTVER2-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtss2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -585,6 +657,13 @@ ; BTVER2-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtss2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -637,6 +716,13 @@ ; BTVER2-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttss2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttss2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi float %a0 to i32 %2 = load float, float *%a1, align 4 %3 = fptosi float %2 to i32 @@ -686,6 +772,13 @@ ; BTVER2-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttss2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttss2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi float %a0 to i64 %2 = load float, float *%a1, align 4 %3 = fptosi float %2 to i64 @@ -729,6 +822,12 @@ ; BTVER2-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: 
[15:1.00] +; ZNVER1-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fdiv <4 x float> %1, %2 @@ -771,6 +870,12 @@ ; BTVER2-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fdiv float %1, %2 @@ -813,6 +918,12 @@ ; BTVER2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00] ; BTVER2-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ldmxcsr: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50] +; ZNVER1-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* store i32 %a0, i32* %1 @@ -857,6 +968,12 @@ ; BTVER2-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %1, <4 x float> %2) @@ -900,6 +1017,12 @@ ; BTVER2-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %1, <4 x float> %2) @@ -943,6 +1066,12 @@ ; BTVER2-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %1, <4 x float> %2) @@ -986,6 +1115,12 @@ ; BTVER2-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %1, <4 x 
float> %2) @@ -1035,6 +1170,13 @@ ; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movaps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovaps (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 16 %2 = fadd <4 x float> %1, %1 store <4 x float> %2, <4 x float> *%a1, align 16 @@ -1079,6 +1221,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhlps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> ret <4 x float> %1 } @@ -1129,6 +1276,13 @@ ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to <2 x float>* %2 = load <2 x float>, <2 x float> *%1, align 8 %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> @@ -1177,6 +1331,12 @@ ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = fadd <4 x float> %a1, %1 ret <4 x float> %2 @@ -1224,6 +1384,13 @@ ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to <2 x float>* %2 = load <2 x float>, <2 x float> *%1, align 8 %3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> @@ -1266,6 +1433,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ret i32 %1 } @@ -1307,6 +1479,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] store <4 x float> %a0, <4 x float> *%a1, align 16, !nontemporal !0 ret void } @@ -1353,6 +1530,13 @@ ; BTVER2-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: 
vmovss %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movss_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovss %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load float, float* %a0, align 1 %2 = fadd float %1, %1 store float %2, float *%a1, align 1 @@ -1395,6 +1579,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movss_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> ret <4 x float> %1 } @@ -1441,6 +1630,13 @@ ; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movups: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovups (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovups %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <4 x float>, <4 x float> *%a0, align 1 %2 = fadd <4 x float> %1, %1 store <4 x float> %2, <4 x float> *%a1, align 1 @@ -1483,6 +1679,12 @@ ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fmul <4 x float> %1, %2 @@ -1525,6 +1727,12 @@ ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fmul float %1, %2 @@ -1575,6 +1783,12 @@ ; BTVER2-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_orps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = or <4 x i32> %1, %2 @@ -1621,6 +1835,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: prefetchnta (%rdi) # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_prefetchnta: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: prefetchnta (%rdi) # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1) ret void } @@ -1670,6 +1889,13 @@ ; BTVER2-NEXT: vrcpps %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rcpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrcpps (%rdi), %xmm1 # sched: [12:0.50] +; 
ZNVER1-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %2) @@ -1728,6 +1954,14 @@ ; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [7:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rcpss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [12:0.50] +; ZNVER1-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -1782,6 +2016,13 @@ ; BTVER2-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rsqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %2) @@ -1840,6 +2081,14 @@ ; BTVER2-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [7:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_rsqrtss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:0.50] +; ZNVER1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [12:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x float> undef, float %a0, i32 0 %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %1) %3 = load float, float *%a1, align 4 @@ -1886,6 +2135,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: sfence # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: sfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse.sfence() ret void } @@ -1931,6 +2185,12 @@ ; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50] ; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50] +; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -1980,6 +2240,13 @@ ; BTVER2-NEXT: vsqrtps %xmm0, %xmm0 # sched: [21:21.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtps (%rdi), %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtps %xmm0, 
%xmm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %2) @@ -2038,6 +2305,14 @@ ; BTVER2-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [26:21.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovaps (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %2) @@ -2082,6 +2357,12 @@ ; BTVER2-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00] ; BTVER2-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_stmxcsr: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50] +; ZNVER1-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = alloca i32, align 4 %2 = bitcast i32* %1 to i8* call void @llvm.x86.sse.stmxcsr(i8* %2) @@ -2126,6 +2407,12 @@ ; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <4 x float> %a0, %a1 %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = fsub <4 x float> %1, %2 @@ -2168,6 +2455,12 @@ ; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub float %a0, %a1 %2 = load float, float *%a2, align 4 %3 = fsub float %1, %2 @@ -2258,6 +2551,20 @@ ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ucomiss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vucomiss (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 4 %3 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %2) @@ -2306,6 +2613,12 @@ ; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] ; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = 
xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -2352,6 +2665,12 @@ ; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] ; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -2402,6 +2721,12 @@ ; BTVER2-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <4 x float> %a0 to <4 x i32> %2 = bitcast <4 x float> %a1 to <4 x i32> %3 = xor <4 x i32> %1, %2 Index: llvm/trunk/test/CodeGen/X86/sse2-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse2-schedule.ll +++ llvm/trunk/test/CodeGen/X86/sse2-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_addpd: @@ -45,6 +45,12 @@ ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fadd <2 x double> %1, %2 @@ -87,6 +93,12 @@ ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; 
ZNVER1-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fadd double %1, %2 @@ -135,6 +147,13 @@ ; BTVER2-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = and <4 x i32> %1, %2 @@ -188,6 +207,13 @@ ; BTVER2-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_andnotpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = xor <4 x i32> %1, @@ -243,6 +269,13 @@ ; BTVER2-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmppd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fcmp oeq <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fcmp oeq <2 x double> %a0, %2 @@ -288,6 +321,12 @@ ; BTVER2-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cmpsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = insertelement <2 x double> undef, double %a1, i32 0 %3 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0) @@ -383,6 +422,20 @@ ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_comisd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vcomisd (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 8 %3 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %2) @@ -433,6 +486,13 @@ ; BTVER2-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # 
sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtdq2pd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> %2 = sitofp <2 x i32> %1 to <2 x double> %3 = load <4 x i32>, <4 x i32>*%a1, align 16 @@ -485,6 +545,13 @@ ; BTVER2-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtdq2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp <4 x i32> %a0 to <4 x float> %2 = load <4 x i32>, <4 x i32>*%a1, align 16 %3 = sitofp <4 x i32> %2 to <4 x float> @@ -535,6 +602,13 @@ ; BTVER2-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %2) @@ -586,6 +660,13 @@ ; BTVER2-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtpd2ps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %2) @@ -637,6 +718,13 @@ ; BTVER2-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtps2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %2) @@ -688,6 +776,13 @@ ; BTVER2-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtps2pd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> %2 = fpext <2 x float> %1 to <2 x double> %3 = load <4 x float>, <4 x float> *%a1, align 16 @@ -739,6 +834,13 @@ ; BTVER2-NEXT: vcvtsd2si 
%xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsd2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtsd2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %1) %3 = load double, double *%a1, align 8 @@ -791,6 +893,13 @@ ; BTVER2-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsd2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x double> undef, double %a0, i32 0 %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %1) %3 = load double, double *%a1, align 8 @@ -850,6 +959,14 @@ ; BTVER2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00] ; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsd2ss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [5:1.00] +; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptrunc double %a0 to float %2 = load double, double *%a1, align 8 %3 = fptrunc double %2 to float @@ -899,6 +1016,13 @@ ; BTVER2-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2sd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i32 %a0 to double %2 = load i32, i32 *%a1, align 8 %3 = sitofp i32 %2 to double @@ -948,6 +1072,13 @@ ; BTVER2-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtsi2sdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sitofp i64 %a0 to double %2 = load i64, i64 *%a1, align 8 %3 = sitofp i64 %2 to double @@ -1006,6 +1137,14 @@ ; BTVER2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00] ; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvtss2sd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [5:1.00] +; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fpext float %a0 to double %2 = load float, float *%a1, align 4 %3 = fpext float %2 to double @@ -1056,6 +1195,13 @@ ; BTVER2-NEXT: vcvttpd2dq 
%xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttpd2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <2 x double> %a0 to <2 x i32> %2 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> %3 = load <2 x double>, <2 x double> *%a1, align 16 @@ -1108,6 +1254,13 @@ ; BTVER2-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttps2dq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [12:1.00] +; ZNVER1-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi <4 x float> %a0 to <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = fptosi <4 x float> %2 to <4 x i32> @@ -1157,6 +1310,13 @@ ; BTVER2-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00] ; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttsd2si: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttsd2si (%rdi), %eax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00] +; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi double %a0 to i32 %2 = load double, double *%a1, align 8 %3 = fptosi double %2 to i32 @@ -1206,6 +1366,13 @@ ; BTVER2-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00] ; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_cvttsd2siq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vcvttsd2si (%rdi), %rax # sched: [12:1.00] +; ZNVER1-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00] +; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fptosi double %a0 to i64 %2 = load double, double *%a1, align 8 %3 = fptosi double %2 to i64 @@ -1249,6 +1416,12 @@ ; BTVER2-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fdiv <2 x double> %1, %2 @@ -1291,6 +1464,12 @@ ; BTVER2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] ; BTVER2-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_divsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [15:1.00] +; ZNVER1-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [22:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fdiv double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fdiv double %1, %2 @@ -1333,6 +1512,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: lfence # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: lfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.lfence() ret void } @@ -1374,6 +1558,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: mfence # 
sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mfence: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: mfence # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.mfence() ret void } @@ -1413,6 +1602,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maskmovdqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) ret void } @@ -1454,6 +1648,12 @@ ; BTVER2-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %2) @@ -1497,6 +1697,12 @@ ; BTVER2-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_maxsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %1, <2 x double> %2) @@ -1540,6 +1746,12 @@ ; BTVER2-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %2) @@ -1583,6 +1795,12 @@ ; BTVER2-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_minsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %1, <2 x double> %2) @@ -1632,6 +1850,13 @@ ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movapd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovapd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x double>, <2 x double> *%a0, align 16 %2 = fadd <2 x 
double> %1, %1 store <2 x double> %2, <2 x double> *%a1, align 16 @@ -1680,6 +1905,13 @@ ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovdqa (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x i64>, <2 x i64> *%a0, align 16 %2 = add <2 x i64> %1, %1 store <2 x i64> %2, <2 x i64> *%a1, align 16 @@ -1728,6 +1960,13 @@ ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movdqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovdqu (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x i64>, <2 x i64> *%a0, align 1 %2 = add <2 x i64> %1, %1 store <2 x i64> %2, <2 x i64> *%a1, align 1 @@ -1794,6 +2033,16 @@ ; BTVER2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovd %xmm0, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vmovd %edi, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vmovd %xmm1, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovd %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x i32> undef, i32 %a1, i32 0 %2 = load i32, i32 *%a2 %3 = insertelement <4 x i32> undef, i32 %2, i32 0 @@ -1865,6 +2114,16 @@ ; BTVER2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovq %xmm0, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movd_64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm1, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm0, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x i64> undef, i64 %a1, i64 0 %2 = load i64, i64 *%a2 %3 = insertelement <2 x i64> undef, i64 %2, i64 0 @@ -1918,6 +2177,13 @@ ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movhpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to double* %2 = load double, double *%1, align 8 %3 = insertelement <2 x double> %a1, double %2, i32 1 @@ -1969,6 +2235,13 @@ ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movlpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50] +; ZNVER1-NEXT: 
vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast x86_mmx* %a2 to double* %2 = load double, double *%1, align 8 %3 = insertelement <2 x double> %a1, double %2, i32 0 @@ -2010,6 +2283,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movmskpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ret i32 %1 } @@ -2053,6 +2331,12 @@ ; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <2 x i64> %a0, %a0 store <2 x i64> %1, <2 x i64> *%a1, align 16, !nontemporal !0 ret void @@ -2094,6 +2378,12 @@ ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fadd <2 x double> %a0, %a0 store <2 x double> %1, <2 x double> *%a1, align 16, !nontemporal !0 ret void @@ -2141,6 +2431,13 @@ ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movq_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vmovq %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load i64, i64* %a1, align 1 %2 = insertelement <2 x i64> zeroinitializer, i64 %1, i32 0 %3 = add <2 x i64> %a0, %2 @@ -2187,6 +2484,12 @@ ; BTVER2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movq_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> %2 = add <2 x i64> %a1, %1 ret <2 x i64> %2 @@ -2234,6 +2537,13 @@ ; BTVER2-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsd_mem: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [8:0.50] +; ZNVER1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load double, double* %a0, align 1 %2 = fadd double %1, %1 store double %2, double *%a1, align 1 @@ -2277,6 +2587,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsd_reg: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50] +; ZNVER1-NEXT: retq # 
sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> ret <2 x double> %1 } @@ -2323,6 +2638,13 @@ ; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movupd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovupd (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <2 x double>, <2 x double> *%a0, align 1 %2 = fadd <2 x double> %1, %1 store <2 x double> %2, <2 x double> *%a1, align 1 @@ -2365,6 +2687,12 @@ ; BTVER2-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fmul <2 x double> %1, %2 @@ -2407,6 +2735,12 @@ ; BTVER2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mulsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; ZNVER1-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [12:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fmul double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fmul double %1, %2 @@ -2455,6 +2789,13 @@ ; BTVER2-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_orpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = or <4 x i32> %1, %2 @@ -2510,6 +2851,12 @@ ; BTVER2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packssdw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <8 x i16> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -2562,6 +2909,12 @@ ; BTVER2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packsswb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <16 x i8> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -2614,6 +2967,12 @@ ; BTVER2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # 
sched: [4:1.00] +; +; ZNVER1-LABEL: test_packuswb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <16 x i8> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -2662,6 +3021,12 @@ ; BTVER2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = add <16 x i8> %1, %2 @@ -2708,6 +3073,12 @@ ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = add <4 x i32> %1, %2 @@ -2750,6 +3121,12 @@ ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = add <2 x i64> %1, %2 @@ -2796,6 +3173,12 @@ ; BTVER2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %1, <16 x i8> %2) @@ -2843,6 +3226,12 @@ ; BTVER2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %1, <8 x i16> %2) @@ -2890,6 +3279,12 @@ ; BTVER2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddusb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load 
<16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %1, <16 x i8> %2) @@ -2937,6 +3332,12 @@ ; BTVER2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddusw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %1, <8 x i16> %2) @@ -2984,6 +3385,12 @@ ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_paddw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = add <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = add <8 x i16> %1, %2 @@ -3032,6 +3439,13 @@ ; BTVER2-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pand: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = and <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = and <2 x i64> %1, %2 @@ -3087,6 +3501,13 @@ ; BTVER2-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pandn: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <2 x i64> %a0, %2 = and <2 x i64> %a1, %1 %3 = load <2 x i64>, <2 x i64> *%a2, align 16 @@ -3136,6 +3557,12 @@ ; BTVER2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pavgb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %1, <16 x i8> %2) @@ -3183,6 +3610,12 @@ ; BTVER2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pavgw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %1, <8 x i16> %2) @@ -3234,6 +3667,13 @@ ; BTVER2-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: 
[6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = icmp eq <16 x i8> %a0, %2 @@ -3286,6 +3726,13 @@ ; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = icmp eq <4 x i32> %a0, %2 @@ -3338,6 +3785,13 @@ ; BTVER2-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = icmp eq <8 x i16> %a0, %2 @@ -3391,6 +3845,13 @@ ; BTVER2-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = icmp sgt <16 x i8> %a0, %2 @@ -3444,6 +3905,13 @@ ; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = icmp eq <4 x i32> %a0, %2 @@ -3497,6 +3965,13 @@ ; BTVER2-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpgtw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp sgt <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = icmp sgt <8 x i16> %a0, %2 @@ -3541,6 +4016,12 @@ ; BTVER2-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: # kill: %AX %AX %EAX ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrw: +; ZNVER1: # BB#0: +; 
ZNVER1-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: # kill: %AX %AX %EAX +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <8 x i16> %a0, i32 6 ret i16 %1 } @@ -3585,6 +4066,12 @@ ; BTVER2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <8 x i16> %a0, i16 %a1, i32 1 %2 = load i16, i16 *%a2 %3 = insertelement <8 x i16> %1, i16 %2, i32 3 @@ -3635,6 +4122,12 @@ ; BTVER2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaddwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) %2 = bitcast <4 x i32> %1 to <8 x i16> %3 = load <8 x i16>, <8 x i16> *%a2, align 16 @@ -3683,6 +4176,12 @@ ; BTVER2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %1, <8 x i16> %2) @@ -3730,6 +4229,12 @@ ; BTVER2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxub: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %1, <16 x i8> %2) @@ -3777,6 +4282,12 @@ ; BTVER2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %1, <8 x i16> %2) @@ -3824,6 +4335,12 @@ ; BTVER2-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminub: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x 
i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %1, <16 x i8> %2) @@ -3863,6 +4380,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovmskb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ret i32 %1 } @@ -3904,6 +4426,12 @@ ; BTVER2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %1, <8 x i16> %2) @@ -3947,6 +4475,12 @@ ; BTVER2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulhw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %1, <8 x i16> %2) @@ -3990,6 +4524,12 @@ ; BTVER2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmullw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = mul <8 x i16> %1, %2 @@ -4040,6 +4580,12 @@ ; BTVER2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmuludq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <2 x i64> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -4090,6 +4636,13 @@ ; BTVER2-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_por: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = or <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = or <2 x i64> %1, %2 @@ -4141,6 +4694,12 @@ ; BTVER2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psadbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; 
ZNVER1-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) %2 = bitcast <2 x i64> %1 to <16 x i8> %3 = load <16 x i8>, <16 x i8> *%a2, align 16 @@ -4193,6 +4752,13 @@ ; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50] +; ZNVER1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> %2 = load <4 x i32>, <4 x i32> *%a1, align 16 %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> @@ -4244,6 +4810,13 @@ ; BTVER2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshufhw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [8:0.50] +; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a1, align 16 %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> @@ -4295,6 +4868,13 @@ ; BTVER2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pshuflw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [8:0.50] +; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a1, align 16 %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> @@ -4344,6 +4924,13 @@ ; BTVER2-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pslld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %1, <4 x i32> %2) @@ -4389,6 +4976,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pslldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %1 } @@ -4435,6 +5027,13 @@ ; BTVER2-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; 
+; ZNVER1-LABEL: test_psllq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %1, <2 x i64> %2) @@ -4486,6 +5085,13 @@ ; BTVER2-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psllw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %1, <8 x i16> %2) @@ -4537,6 +5143,13 @@ ; BTVER2-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrad: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> %2) @@ -4588,6 +5201,13 @@ ; BTVER2-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psraw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> %2) @@ -4639,6 +5259,13 @@ ; BTVER2-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %1, <4 x i32> %2) @@ -4684,6 +5311,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %1 } @@ -4730,6 +5362,13 @@ ; BTVER2-NEXT: vpsrlq (%rdi), %xmm0, 
%xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrlq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %1, <2 x i64> %2) @@ -4781,6 +5420,13 @@ ; BTVER2-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psrlw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %1, <8 x i16> %2) @@ -4830,6 +5476,12 @@ ; BTVER2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <16 x i8> %a0, %a1 %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = sub <16 x i8> %1, %2 @@ -4876,6 +5528,12 @@ ; BTVER2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = sub <4 x i32> %1, %2 @@ -4918,6 +5576,12 @@ ; BTVER2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = sub <2 x i64> %1, %2 @@ -4964,6 +5628,12 @@ ; BTVER2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %1, <16 x i8> %2) @@ -5011,6 +5681,12 @@ ; BTVER2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubsw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubsw 
%xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %1, <8 x i16> %2) @@ -5058,6 +5734,12 @@ ; BTVER2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubusb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %1, <16 x i8> %2) @@ -5105,6 +5787,12 @@ ; BTVER2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubusw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %1, <8 x i16> %2) @@ -5152,6 +5840,12 @@ ; BTVER2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_psubw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = sub <8 x i16> %a0, %a1 %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = sub <8 x i16> %1, %2 @@ -5198,6 +5892,12 @@ ; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50] ; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> @@ -5248,6 +5948,13 @@ ; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: 
[8:0.50] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> @@ -5297,6 +6004,13 @@ ; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhqdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> @@ -5344,6 +6058,12 @@ ; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckhwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> @@ -5390,6 +6110,12 @@ ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> @@ -5440,6 +6166,13 @@ ; BTVER2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpckldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.25] +; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> @@ -5489,6 +6222,13 @@ ; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm1 
= xmm1[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklqdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> @@ -5536,6 +6276,12 @@ ; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] ; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_punpcklwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25] +; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> @@ -5584,6 +6330,13 @@ ; BTVER2-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pxor: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = xor <2 x i64> %a0, %a1 %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = xor <2 x i64> %1, %2 @@ -5633,6 +6386,13 @@ ; BTVER2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_shufpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> @@ -5683,6 +6443,13 @@ ; BTVER2-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:21.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [20:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %2) @@ -5741,6 +6508,14 @@ ; BTVER2-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [26:21.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_sqrtsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovapd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vsqrtsd 
%xmm0, %xmm0, %xmm0 # sched: [27:1.00] +; ZNVER1-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [27:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2) @@ -5785,6 +6560,12 @@ ; BTVER2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub <2 x double> %a0, %a1 %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fsub <2 x double> %1, %2 @@ -5827,6 +6608,12 @@ ; BTVER2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_subsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = fsub double %a0, %a1 %2 = load double, double *%a2, align 8 %3 = fsub double %1, %2 @@ -5917,6 +6704,20 @@ ; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ucomisd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: vucomisd (%rdi), %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: setnp %al # sched: [1:0.25] +; ZNVER1-NEXT: sete %dl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25] +; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 8 %3 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %2) @@ -5967,6 +6768,13 @@ ; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpckhpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> @@ -6022,6 +6830,13 @@ ; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_unpcklpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load 
<2 x double>, <2 x double> *%a2, align 16 %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> @@ -6071,6 +6886,13 @@ ; BTVER2-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_xorpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = bitcast <2 x double> %a0 to <4 x i32> %2 = bitcast <2 x double> %a1 to <4 x i32> %3 = xor <4 x i32> %1, %2 Index: llvm/trunk/test/CodeGen/X86/sse3-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse3-schedule.ll +++ llvm/trunk/test/CodeGen/X86/sse3-schedule.ll @@ -7,7 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_addsubpd: @@ -45,6 +45,12 @@ ; BTVER2-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %1, <2 x double> %2) @@ -88,6 +94,12 @@ ; BTVER2-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_addsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %1, <4 x float> %2) @@ -131,6 +143,12 @@ ; BTVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %1, <2 x 
double> %2) @@ -174,6 +192,12 @@ ; BTVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_haddps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %1, <4 x float> %2) @@ -217,6 +241,12 @@ ; BTVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %1, <2 x double> %2) @@ -260,6 +290,12 @@ ; BTVER2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_hsubps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %2) @@ -299,6 +335,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vlddqu (%rdi), %xmm0 # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_lddqu: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vlddqu (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) ret <16 x i8> %1 } @@ -347,6 +388,13 @@ ; BTVER2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movddup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [8:0.50] +; ZNVER1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer @@ -397,6 +445,13 @@ ; BTVER2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movshdup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [8:0.50] +; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> @@ -447,6 +502,13 
@@ ; BTVER2-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movsldup: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [8:0.50] +; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> Index: llvm/trunk/test/CodeGen/X86/sse41-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse41-schedule.ll +++ llvm/trunk/test/CodeGen/X86/sse41-schedule.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_blendpd: @@ -43,6 +43,13 @@ ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = fadd <2 x double> %a1, %1 @@ -80,6 +87,12 @@ ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] +; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> @@ -122,6 +135,12 @@ ; BTVER2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) %2 = load <2 x double>, <2 x double> *%a3, 
align 16 %3 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %a2) @@ -165,6 +184,12 @@ ; BTVER2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_blendvps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; ZNVER1-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) %2 = load <4 x float>, <4 x float> *%a3 %3 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %a2) @@ -202,6 +227,12 @@ ; BTVER2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_dppd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) %2 = load <2 x double>, <2 x double> *%a2, align 16 %3 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %1, <2 x double> %2, i8 7) @@ -239,6 +270,12 @@ ; BTVER2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_dpps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %1, <4 x float> %2, i8 7) @@ -276,6 +313,12 @@ ; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50] ; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_insertps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50] +; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17) %2 = load float, float *%a2 %3 = insertelement <4 x float> %1, float %2, i32 3 @@ -308,6 +351,11 @@ ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [5:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_movntdqa: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %a0) ret <2 x i64> %1 } @@ -343,6 +391,12 @@ ; BTVER2-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [3:2.00] ; BTVER2-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_mpsadbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) %2 = bitcast <8 x i16> %1 to <16 x 
i8> %3 = load <16 x i8>, <16 x i8> *%a2, align 16 @@ -381,6 +435,12 @@ ; BTVER2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_packusdw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <8 x i16> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -425,6 +485,12 @@ ; BTVER2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pblendvb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; ZNVER1-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) %2 = load <16 x i8>, <16 x i8> *%a3, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %a2) @@ -462,6 +528,12 @@ ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50] ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pblendw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50] +; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> @@ -498,6 +570,12 @@ ; BTVER2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpeqq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = icmp eq <2 x i64> %a0, %a1 %2 = sext <2 x i1> %1 to <2 x i64> %3 = load <2 x i64>, <2 x i64>*%a2, align 16 @@ -536,6 +614,12 @@ ; BTVER2-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <16 x i8> %a0, i32 3 %2 = extractelement <16 x i8> %a0, i32 1 store i8 %2, i8 *%a1 @@ -573,6 +657,12 @@ ; BTVER2-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <4 x i32> %a0, i32 3 %2 = extractelement <4 x i32> %a0, i32 1 store i32 %2, i32 *%a1 @@ -609,6 +699,12 @@ ; BTVER2-NEXT: 
vpextrq $1, %xmm0, %rax # sched: [1:0.50] ; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <2 x i64> %a0, i32 1 %2 = extractelement <2 x i64> %a0, i32 1 store i64 %2, i64 *%a2 @@ -645,6 +741,12 @@ ; BTVER2-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50] ; BTVER2-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pextrw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.25] +; ZNVER1-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [8:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = extractelement <8 x i16> %a0, i32 3 %2 = extractelement <8 x i16> %a0, i32 1 store i16 %2, i16 *%a1 @@ -682,6 +784,12 @@ ; BTVER2-NEXT: vphminposuw (%rdi), %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: vphminposuw %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_phminposuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: vphminposuw %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = load <8 x i16>, <8 x i16> *%a0, align 16 %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %1) %3 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %2) @@ -719,6 +827,12 @@ ; BTVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <16 x i8> %a0, i8 %a1, i32 1 %2 = load i8, i8 *%a2 %3 = insertelement <16 x i8> %1, i8 %2, i32 3 @@ -755,6 +869,12 @@ ; BTVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <4 x i32> %a0, i32 %a1, i32 1 %2 = load i32, i32 *%a2 %3 = insertelement <4 x i32> %1, i32 %2, i32 3 @@ -796,6 +916,13 @@ ; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pinsrq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = insertelement <2 x i64> %a0, i64 %a2, i32 1 %2 = load i64, i64 *%a3 %3 = insertelement <2 x i64> %a1, i64 %2, i32 1 @@ -833,6 +960,12 @@ ; BTVER2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = 
call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %1, <16 x i8> %2) @@ -870,6 +1003,12 @@ ; BTVER2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %1, <4 x i32> %2) @@ -907,6 +1046,12 @@ ; BTVER2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxud: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %1, <4 x i32> %2) @@ -944,6 +1089,12 @@ ; BTVER2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmaxuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %1, <8 x i16> %2) @@ -981,6 +1132,12 @@ ; BTVER2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsb: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) %2 = load <16 x i8>, <16 x i8> *%a2, align 16 %3 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %1, <16 x i8> %2) @@ -1018,6 +1175,12 @@ ; BTVER2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %1, <4 x i32> %2) @@ -1055,6 +1218,12 @@ ; BTVER2-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminud: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x i32> 
@llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %1, <4 x i32> %2) @@ -1092,6 +1261,12 @@ ; BTVER2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pminuw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) %2 = load <8 x i16>, <8 x i16> *%a2, align 16 %3 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %1, <8 x i16> %2) @@ -1135,6 +1310,13 @@ ; BTVER2-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> %2 = sext <8 x i8> %1 to <8 x i16> %3 = load <8 x i8>, <8 x i8>* %a1, align 1 @@ -1179,6 +1361,13 @@ ; BTVER2-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> %2 = sext <4 x i8> %1 to <4 x i32> %3 = load <4 x i8>, <4 x i8>* %a1, align 1 @@ -1223,6 +1412,13 @@ ; BTVER2-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxbq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> %2 = sext <2 x i8> %1 to <2 x i64> %3 = load <2 x i8>, <2 x i8>* %a1, align 1 @@ -1267,6 +1463,13 @@ ; BTVER2-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> %2 = sext <2 x i32> %1 to <2 x i64> %3 = load <2 x i32>, <2 x i32>* %a1, align 1 @@ -1311,6 +1514,13 @@ ; BTVER2-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = 
shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = sext <4 x i16> %1 to <4 x i32> %3 = load <4 x i16>, <4 x i16>* %a1, align 1 @@ -1355,6 +1565,13 @@ ; BTVER2-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovsxwq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [8:0.50] +; ZNVER1-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> %2 = sext <2 x i16> %1 to <2 x i64> %3 = load <2 x i16>, <2 x i16>* %a1, align 1 @@ -1399,6 +1616,13 @@ ; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbw: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> %2 = zext <8 x i8> %1 to <8 x i16> %3 = load <8 x i8>, <8 x i8>* %a1, align 1 @@ -1443,6 +1667,13 @@ ; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> %2 = zext <4 x i8> %1 to <4 x i32> %3 = load <4 x i8>, <4 x i8>* %a1, align 1 @@ -1487,6 +1718,13 @@ ; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxbq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> %2 = zext <2 x i8> %1 to <2 x i64> %3 = load <2 x i8>, <2 x i8>* %a1, align 1 @@ -1531,6 +1769,13 @@ ; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: 
test_pmovzxdq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> %2 = zext <2 x i32> %1 to <2 x i64> %3 = load <2 x i32>, <2 x i32>* %a1, align 1 @@ -1575,6 +1820,13 @@ ; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50] ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxwd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = zext <4 x i16> %1 to <4 x i32> %3 = load <4 x i16>, <4 x i16>* %a1, align 1 @@ -1619,6 +1871,13 @@ ; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmovzxwq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [8:0.50] +; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.25] +; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> %2 = zext <2 x i16> %1 to <2 x i64> %3 = load <2 x i16>, <2 x i16>* %a1, align 1 @@ -1657,6 +1916,12 @@ ; BTVER2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmuldq: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) %2 = bitcast <2 x i64> %1 to <4 x i32> %3 = load <4 x i32>, <4 x i32> *%a2, align 16 @@ -1695,6 +1960,12 @@ ; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] ; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pmulld: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; ZNVER1-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = mul <4 x i32> %a0, %a1 %2 = load <4 x i32>, <4 x i32> *%a2, align 16 %3 = mul <4 x i32> %1, %2 @@ -1751,6 +2022,16 @@ ; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50] ; BTVER2-NEXT: movzbl %cl, %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_ptest: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-NEXT: setb %al # sched: [1:0.25] +; ZNVER1-NEXT: vptest (%rdi), %xmm0 # sched: [8:0.50] +; ZNVER1-NEXT: setb %cl # sched: [1:0.25] +; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25] +; ZNVER1-NEXT: movzbl %cl, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse41.ptestc(<2 x 
i64> %a0, <2 x i64> %a1) %2 = load <2 x i64>, <2 x i64> *%a2, align 16 %3 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %2) @@ -1795,6 +2076,13 @@ ; BTVER2-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundpd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:1.00] +; ZNVER1-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) %2 = load <2 x double>, <2 x double> *%a1, align 16 %3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %2, i32 7) @@ -1839,6 +2127,13 @@ ; BTVER2-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundps: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:1.00] +; ZNVER1-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) %2 = load <4 x float>, <4 x float> *%a1, align 16 %3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %2, i32 7) @@ -1884,6 +2179,13 @@ ; BTVER2-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundsd: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) %2 = load <2 x double>, <2 x double>* %a2, align 16 %3 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %2, i32 7) @@ -1929,6 +2231,13 @@ ; BTVER2-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_roundss: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; ZNVER1-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) %2 = load <4 x float>, <4 x float> *%a2, align 16 %3 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %2, i32 7) Index: llvm/trunk/test/CodeGen/X86/sse42-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse42-schedule.ll +++ llvm/trunk/test/CodeGen/X86/sse42-schedule.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1 define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) { ; GENERIC-LABEL: crc32_32_8: @@ -43,6 +43,13 @@ ; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_8: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1) %2 = load i8, i8 *%a2 %3 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %1, i8 %2) @@ -85,6 +92,13 @@ ; BTVER2-NEXT: crc32w (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_16: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32w %si, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32w (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1) %2 = load i16, i16 *%a2 %3 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %1, i16 %2) @@ -127,6 +141,13 @@ ; BTVER2-NEXT: crc32l (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_32_32: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32l %esi, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32l (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1) %2 = load i32, i32 *%a2 %3 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %1, i32 %2) @@ -169,6 +190,13 @@ ; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00] ; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_64_8: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00] +; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00] +; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1) %2 = load i8, i8 *%a2 %3 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %1, i8 %2) @@ -211,6 +239,13 @@ ; BTVER2-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00] ; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: crc32_64_64: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: crc32q %rsi, %rdi # sched: [3:1.00] +; ZNVER1-NEXT: crc32q (%rdx), %rdi # sched: [10:1.00] +; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25] +; ZNVER1-NEXT: retq # sched: [5:0.50] %1 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1) %2 = load i64, i64 *%a2 %3 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %1, i64 %2) @@ -283,6 +318,19 @@ ; BTVER2-NEXT: # kill: %ECX %ECX %RCX ; BTVER2-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_pcmpestri: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [100:0.00] +; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25] +; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25] +; ZNVER1-NEXT: movl %ecx, %esi # sched: [1:0.25] 
+; ZNVER1-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: # kill: %ECX %ECX %RCX
+; ZNVER1-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %2, i32 7, i8 7)
@@ -341,6 +389,16 @@
; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17]
; BTVER2-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [18:2.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpestrm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7)
@@ -393,6 +451,15 @@
; BTVER2-NEXT: # kill: %ECX %ECX %RCX
; BTVER2-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpistri:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: movl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: # kill: %ECX %ECX %RCX
+; ZNVER1-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %2, i8 7)
@@ -431,6 +498,12 @@
; BTVER2-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [12:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpistrm:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %1, <16 x i8> %2, i8 7)
@@ -468,6 +541,12 @@
; BTVER2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = icmp sgt <2 x i64> %a0, %a1
%2 = sext <2 x i1> %1 to <2 x i64>
%3 = load <2 x i64>, <2 x i64>*%a2, align 16
Index: llvm/trunk/test/CodeGen/X86/sse4a-schedule.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse4a-schedule.ll
+++ llvm/trunk/test/CodeGen/X86/sse4a-schedule.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1
define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
; GENERIC-LABEL: test_extrq:
@@ -13,6 +13,11 @@
; BTVER2: # BB#0:
; BTVER2-NEXT: extrq %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_extrq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: extrq %xmm1, %xmm0 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %a0, <16 x i8> %a1)
ret <2 x i64> %1
}
@@ -28,6 +33,11 @@
; BTVER2: # BB#0:
; BTVER2-NEXT: extrq $2, $3, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_extrqi:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: extrq $2, $3, %xmm0 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a0, i8 3, i8 2)
ret <2 x i64> %1
}
@@ -43,6 +53,11 @@
; BTVER2: # BB#0:
; BTVER2-NEXT: insertq %xmm1, %xmm0 # sched: [2:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_insertq:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: insertq %xmm1, %xmm0 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %1
}
@@ -58,6 +73,11 @@
; BTVER2: # BB#0:
; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [2:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_insertqi:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [?:0.000000e+00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 6)
ret <2 x i64> %1
}
@@ -73,6 +93,11 @@
; BTVER2: # BB#0:
; BTVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movntsd %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a)
ret void
}
@@ -88,6 +113,11 @@
; BTVER2: # BB#0:
; BTVER2-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntss:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movntss %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a)
ret void
}
Index: llvm/trunk/test/CodeGen/X86/ssse3-schedule.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/ssse3-schedule.ll
+++ llvm/trunk/test/CodeGen/X86/ssse3-schedule.ll
@@ -7,7 +7,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
; GENERIC-LABEL: test_pabsb:
@@ -52,6 +52,13 @@
; BTVER2-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pabsb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpabsb (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
%2 = load <16 x i8>, <16 x i8> *%a1, align 16
%3 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %2)
@@ -103,6 +110,13 @@
; BTVER2-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pabsd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpabsd (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0)
%2 = load <4 x i32>, <4 x i32> *%a1, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %2)
@@ -147,6 +161,11 @@
; BTVER2: # BB#0:
; BTVER2-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pabsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0)
%2 = load <8 x i16>, <8 x i16> *%a1, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %2)
@@ -196,6 +215,12 @@
; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_palignr:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.25]
+; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32>
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = shufflevector <8 x i16> %2, <8 x i16> %1, <8 x i32>
@@ -238,6 +263,12 @@
; BTVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phaddd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2)
@@ -289,6 +320,12 @@
; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phaddsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %1, <8 x i16> %2)
@@ -332,6 +369,12 @@
; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phaddw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2)
@@ -375,6 +418,12 @@
; BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phsubd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %1, <4 x i32> %2)
@@ -426,6 +475,12 @@
; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phsubsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %1, <8 x i16> %2)
@@ -469,6 +524,12 @@
; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phsubw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %1, <8 x i16> %2)
@@ -512,6 +573,12 @@
; BTVER2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaddubsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = bitcast <8 x i16> %1 to <16 x i8>
@@ -550,6 +617,11 @@
; BTVER2: # BB#0:
; BTVER2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmulhrsw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %1, <8 x i16> %2)
@@ -593,6 +665,12 @@
; BTVER2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pshufb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> %2)
@@ -644,6 +722,12 @@
; BTVER2-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psignb:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %1, <16 x i8> %2)
@@ -695,6 +779,12 @@
; BTVER2-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psignd:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %1, <4 x i32> %2)
@@ -746,6 +836,12 @@
; BTVER2-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psignw:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %1, <8 x i16> %2)