Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -431,6 +431,7 @@ include "X86SchedBroadwell.td" include "X86ScheduleSLM.td" include "X86ScheduleZnver1.td" +include "X86ScheduleBdVer2.td" include "X86ScheduleBtVer2.td" include "X86SchedSkylakeClient.td" include "X86SchedSkylakeServer.td" @@ -996,7 +997,7 @@ ]>; // Bulldozer -def : Proc<"bdver1", [ +def : ProcessorModel<"bdver1", BdVer2Model, [ FeatureX87, FeatureCMOV, FeatureXOP, @@ -1021,7 +1022,7 @@ FeatureMacroFusion ]>; // Piledriver -def : Proc<"bdver2", [ +def : ProcessorModel<"bdver2", BdVer2Model, [ FeatureX87, FeatureCMOV, FeatureXOP, Index: lib/Target/X86/X86PfmCounters.td =================================================================== --- lib/Target/X86/X86PfmCounters.td +++ lib/Target/X86/X86PfmCounters.td @@ -75,6 +75,25 @@ def SKXUopsCounter : PfmUopsCounter<"uops_issued:any">; } +let SchedModel = BdVer2Model in { +def PdCycleCounter : PfmCycleCounter<"cpu_clk_unhalted">; +def PdUopsCounter : PfmUopsCounter<"retired_uops">; +def PdFPU0Counter : PfmIssueCounter; +def PdFPU1Counter : PfmIssueCounter; +def PdFPU2Counter : PfmIssueCounter; +def PdFPU3Counter : PfmIssueCounter; + +// NOTE: it would seem there are no uops counters for any other pipes. +} + let SchedModel = BtVer2Model in { def JCycleCounter : PfmCycleCounter<"cpu_clk_unhalted">; def JUopsCounter : PfmUopsCounter<"retired_uops">; Index: lib/Target/X86/X86ScheduleBdVer2.td =================================================================== --- /dev/null +++ lib/Target/X86/X86ScheduleBdVer2.td @@ -0,0 +1,1173 @@ +//=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling * tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for AMD bdver2 (Piledriver) to support +// instruction scheduling and other instruction cost heuristics. +// Based on: +// * AMD Software Optimization Guide for AMD Family 15h Processors. +// https://support.amd.com/TechDocs/47414_15h_sw_opt_guide.pdf +// * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog +// http://www.agner.org/optimize/microarchitecture.pdf +// * https://www.realworldtech.com/bulldozer/ +// Yes, that is for Bulldozer aka bdver1, not Piledriver aka bdver2. +// +//===----------------------------------------------------------------------===// + +def BdVer2Model : SchedMachineModel { + let IssueWidth = 4; // Up to 4 IPC can be decoded, issued, retired. + let MicroOpBufferSize = 128; // RCU reorder buffer size, which is unconfirmed. + let LoopMicroOpBufferSize = -1; // There does not seem to be a loop buffer. + let LoadLatency = 4; // L1 data cache has a 4-cycle load-to-use latency. + let HighLatency = 25; // FIXME: any better choice? + let MispredictPenalty = 20; // Minimum branch misdirection penalty. + + let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass. + + // FIXME: Incomplete. This flag is set to allow the scheduler to assign + // a default model to unrecognized opcodes. + let CompleteModel = 0; +} // SchedMachineModel + +let SchedModel = BdVer2Model in { + + +//===----------------------------------------------------------------------===// +// Pipes +//===----------------------------------------------------------------------===// + +// There are a total of eight pipes. + +//===----------------------------------------------------------------------===// +// Integer execution pipes +// + +// Two EX (ALU) pipes. +def PdEX0 : ProcResource<1>; // ALU, Integer Pipe0 +def PdEX1 : ProcResource<1>; // ALU, Integer Pipe1 +def PdEX01 : ProcResGroup<[PdEX0, PdEX1]>; + +// Two AGLU pipes, identical. 
+def PdAGLU01 : ProcResource<2>; // AGU, Integer Pipe[23] + +//===----------------------------------------------------------------------===// +// Floating point execution pipes +// + +// Four FPU pipes. + +def PdFPU0 : ProcResource<1>; // Vector/FPU Pipe0 +def PdFPU1 : ProcResource<1>; // Vector/FPU Pipe1 +def PdFPU2 : ProcResource<1>; // Vector/FPU Pipe2 +def PdFPU3 : ProcResource<1>; // Vector/FPU Pipe3 + +// FPU grouping +def PdFPU01 : ProcResGroup<[PdFPU0, PdFPU1]>; +def PdFPU23 : ProcResGroup<[PdFPU2, PdFPU3]>; + + +//===----------------------------------------------------------------------===// +// RCU +//===----------------------------------------------------------------------===// + +// The Retire Control Unit on Piledriver can retire up to 4 macro-ops per cycle. +// On the other hand, the RCU reorder buffer size for Piledriver does not +// seem to be specified in any trustworthy source. +// But as per https://www.realworldtech.com/bulldozer/6/ the Bulldozer had +// an RCU reorder buffer size of 128. So that is a good guess for now. +def PdRCU : RetireControlUnit<128, 4>; + + +//===----------------------------------------------------------------------===// +// Pipelines +//===----------------------------------------------------------------------===// + +// There are a total of two pipelines, each one with its own scheduler. + +//===----------------------------------------------------------------------===// +// Integer Pipeline Scheduling +// + +// There is one Integer Scheduler per core. + +// Integer physical register file has 96 registers of 64-bit. +def PdIntegerPRF : RegisterFile<96, [GR64, CCR]>; + +// Unified Integer, Memory Scheduler has 40 entries. +def PdEX : ProcResGroup<[PdEX0, PdEX1, PdAGLU01]> { + // Up to 4 IPC can be decoded, issued, retired. + let BufferSize = 40; +} + + +//===----------------------------------------------------------------------===// +// FPU Pipeline Scheduling +// + +// The FPU unit is shared between the two cores. 
+ +// FP physical register file has 160 registers of 128-bit. +// Operations on 256-bit data types are cracked into two COPs. +def PdFpuPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>; + +// Unified FP Scheduler has 64 entries, +def PdFPU : ProcResGroup<[PdFPU0, PdFPU1, PdFPU2, PdFPU3]> { + // Up to 4 IPC can be decoded, issued, retired. + let BufferSize = 64; +} + + +//===----------------------------------------------------------------------===// +// Functional units +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Load-Store Units +// + +// FIXME: does this even make sense? + +def PdLoad : ProcResGroup<[PdAGLU01]> { + // For Piledriver, the load queue is 40 entries deep. + let BufferSize = 40; +} + +def PdStore : ProcResGroup<[PdAGLU01]> { + // For Piledriver, the store queue is 24 entries deep. + let BufferSize = 24; +} + +//===----------------------------------------------------------------------===// +// Integer Execution Units +// + +def PdDiv : ProcResource<1>; // PdEX0; unpipelined integer division +def PdCount : ProcResource<1>; // PdEX0; POPCNT, LZCOUNT + +def PdMul : ProcResource<1>; // PdEX1; integer multiplication +def PdBranch : ProcResource<1>; // PdEX1; JMP, fused branches + +//===----------------------------------------------------------------------===// +// Floating-Point Units +// + +// Two FMAC/FPFMA units. +def PdFPFMA : ProcResource<2>; // PdFPU0, PdFPU1 + +// One 128-bit integer multiply-accumulate unit. +def PdFPMMA : ProcResource<1>; // PdFPU0 + +// One fp conversion unit. +def PdFPCVT : ProcResource<1>; // PdFPU0 + +// One unit for shuffles, packs, permutes, shifts. +def PdFPXBR : ProcResource<1>; // PdFPU1 + +// Two 128-bit packed integer units. +def PdFPMAL : ProcResource<2>; // PdFPU2, PdFPU3 + +// One FP store unit. 
+def PdFPSTO : ProcResource<1>; // PdFPU3 + + +//===----------------------------------------------------------------------===// +// Basic helper classes. +//===----------------------------------------------------------------------===// + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when dispatched by the schedulers. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass PdWriteRes ExePorts, int Lat = 1, + list Res = [], int UOps = 1> { + def : WriteRes { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } +} + +multiclass __pdWriteResPair ExePorts, int Lat, + list Res, int UOps, + int LoadLat, int LoadRes, int LoadUOps> { + defm : PdWriteRes; + + defm : PdWriteRes; +} + +multiclass PdWriteResExPair ExePorts, int Lat = 1, + list Res = [], int UOps = 1, + int ExtraLoadUOps = 0> { + defm : __pdWriteResPair; +} + +multiclass PdWriteResXMMPair ExePorts, int Lat = 1, + list Res = [], int UOps = 1, + int ExtraLoadUOps = 0> { + defm : __pdWriteResPair; +} + +multiclass PdWriteResYMMPair ExePorts, int Lat, + list Res, int UOps = 2, + int ExtraLoadUOps = 0> { + defm : __pdWriteResPair; +} + +//===----------------------------------------------------------------------===// +// Here be dragons. +//===----------------------------------------------------------------------===// + +// L1 data cache has a 4-cycle load-to-use latency, so ReadAfterLd registers +// needn't be available until 4 cycles after the memory operand. +def : ReadAdvance; + +// A folded store needs a cycle on the PdStore for the store data. +def : WriteRes; + +//////////////////////////////////////////////////////////////////////////////// +// Loads, stores, and moves, not folded with other operations. 
+//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes { let Latency = 5; } +def : WriteRes; +def : WriteRes; +def : WriteRes; + +// Load/store MXCSR. +// FIXME: These are copy and pasted from WriteLoad/Store. +def : WriteRes { let Latency = 5; } +def : WriteRes { let NumMicroOps = 2; } + +// Treat misc copies as a move. +def : InstRW<[WriteMove], (instrs COPY)>; + +//////////////////////////////////////////////////////////////////////////////// +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes; + +//////////////////////////////////////////////////////////////////////////////// +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteResExPair; + +//////////////////////////////////////////////////////////////////////////////// +// Special case scheduling classes. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes { let Latency = 100; } +def : WriteRes { let Latency = 100; } +def : WriteRes; + +def PdWriteXLAT: SchedWriteRes<[PdEX01]> { + let Latency = 6; +} +def : InstRW<[PdWriteXLAT], (instrs XLAT)>; + +def PdWriteLARrr : SchedWriteRes<[PdEX01]> { + let Latency = 184; + let NumMicroOps = 45; +} +def: InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr", + "LSL(16|32|64)rr")>; + +// Nops don't have dependencies, so there's no actual latency, but we set this +// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle. +def : WriteRes { let Latency = 1; } + +//////////////////////////////////////////////////////////////////////////////// +// Arithmetic. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteResExPair; +defm : PdWriteResExPair; + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; + +def PdWriteXCHG16rr: SchedWriteRes<[PdEX1]> { + let Latency = 2; + let NumMicroOps = 2; +} +def : InstRW<[PdWriteXCHG16rr], (instrs XCHG16rr)>; + +def PdWriteXADD: SchedWriteRes<[PdEX1]> { + let Latency = 1; +} +def : InstRW<[PdWriteXADD], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr)>; + +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteRes; + +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; + +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; + +defm : PdWriteResExPair; + +def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> { + let Latency = 5; + let ResourceCycles = [4]; + let NumMicroOps = 5; +} +def: InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>; + +def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> { + let Latency = 6; + let ResourceCycles = [4]; + let NumMicroOps = 7; +} +def: InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>; + +def PdWriteCRC32r32r8 : SchedWriteRes<[PdEX01]> { + let Latency = 3; + let ResourceCycles = [4]; + let NumMicroOps = 3; +} +def: InstRW<[PdWriteCRC32r32r8], (instrs CRC32r32r8)>; + +def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> { + let Latency = 10; + let ResourceCycles = [4]; + let NumMicroOps = 11; +} +def: InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>; + +defm : PdWriteResExPair; // Conditional move. +defm : PdWriteResExPair; // Conditional (CF + ZF flag) move. +defm : PdWriteRes; // x87 conditional move. + +def : WriteRes { let Latency = 1; } // Setcc. 
+def : WriteRes; + +def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> { + let ResourceCycles = [2]; + let NumMicroOps = 2; +} +def: InstRW<[PdWriteSETGEmSETGmSETLEmSETLm], (instrs SETGEm, SETGm, + SETLEm, SETLm)>; + +defm : PdWriteRes; + +def WriteLAHF : SchedWriteRes<[PdEX01]> { + let Latency = 2; + let NumMicroOps = 4; +} +def: InstRW<[WriteLAHF], (instrs LAHF)>; + +def WriteSAHF : SchedWriteRes<[PdEX01]> { + let Latency = 2; + let NumMicroOps = 2; +} +def: InstRW<[WriteSAHF], (instrs SAHF)>; + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; + +// This is for simple LEAs with one or two input operands. +// FIXME: SAGU 3-operand LEA +def : WriteRes { let NumMicroOps = 2; } + +// Bit counts. +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; + +// BMI1 BEXTR, BMI2 BZHI +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; + +//////////////////////////////////////////////////////////////////////////////// +// Integer shifts and rotates. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; +defm : PdWriteResExPair; + +def PdWriteRCL8rCL: SchedWriteRes<[PdEX01]> { + let Latency = 12; + let NumMicroOps = 26; +} +def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>; + +def PdWriteRCR8ri: SchedWriteRes<[PdEX01]> { + let Latency = 12; + let NumMicroOps = 23; +} +def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>; + +def PdWriteRCR8rCL: SchedWriteRes<[PdEX01]> { + let Latency = 11; + let NumMicroOps = 24; +} +def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>; + +def PdWriteRCL16rCL: SchedWriteRes<[PdEX01]> { + let Latency = 10; + let NumMicroOps = 22; +} +def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>; + +def PdWriteRCR16ri: SchedWriteRes<[PdEX01]> { + let Latency = 10; + let NumMicroOps = 19; +} +def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>; + +def PdWriteRCL32rCLRCL64rCL: SchedWriteRes<[PdEX01]> { + let Latency = 7; + let NumMicroOps = 17; +} +def : InstRW<[PdWriteRCL32rCLRCL64rCL], (instrs RCL32rCL, RCL64rCL)>; + +def PdWriteRCR64rCL: SchedWriteRes<[PdEX01]> { + let Latency = 7; + let NumMicroOps = 16; +} +def : InstRW<[PdWriteRCR64rCL], (instrs RCR64rCL)>; + +def PdWriteRCR32rCL : SchedWriteRes<[PdEX01]> { + let Latency = 7; + let NumMicroOps = 16; +} +def : InstRW<[PdWriteRCR32rCL ], (instrs RCR32rCL)>; + +def PdWriteRCR32riRCR64ri: SchedWriteRes<[PdEX01]> { + let Latency = 7; + let NumMicroOps = 15; +} +def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>; + + +def PdWriteRCR16rCL: SchedWriteRes<[PdEX01]> { + let Latency = 9; + let NumMicroOps = 20; +} +def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>; + +def PdWriteRCL16ri: SchedWriteRes<[PdEX01]> { + let Latency = 11; + let NumMicroOps = 21; +} +def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>; + +def PdWriteRCL3264ri: SchedWriteRes<[PdEX01]> { + let Latency = 8; + let NumMicroOps = 16; +} +def : InstRW<[PdWriteRCL3264ri], 
(instrs RCL32ri, RCL64ri)>; + +def PdWriteRCL8ri: SchedWriteRes<[PdEX01]> { + let Latency = 13; + let NumMicroOps = 25; +} +def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>; + +// SHLD/SHRD. +defm : PdWriteRes; +defm : PdWriteRes; + +def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> { + let Latency = 3; + let ResourceCycles = [6]; + let NumMicroOps = 6; +} +def : InstRW<[PdWriteSHLD32rri8SHRD16rri8 ], (instrs SHLD32rri8, SHRD16rri8)>; + +def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> { + let Latency = 4; + let ResourceCycles = [8]; + let NumMicroOps = 7; +} +def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL, + SHLD32rrCL, + SHRD32rrCL)>; + +defm : PdWriteRes; +defm : PdWriteRes; + +//////////////////////////////////////////////////////////////////////////////// +// Floating point. This covers both scalar and vector operations. +//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; +// WriteFMaskedLoadX +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +def PdWriteVMOVUPDYmrVMOVUPSYmr: SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> { + let NumMicroOps = 8; +} +def : InstRW<[PdWriteVMOVUPDYmrVMOVUPSYmr], (instrs VMOVUPDYmr, VMOVUPSYmr)>; + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; +// WriteFMaskedStoreX +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; 
+defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + + +defm : PdWriteResXMMPair; + +defm : PdWriteResXMMPair; +// WriteDPPSX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +def PdWriteVDPPSrri: SchedWriteRes<[PdFPU1, PdFPFMA]> { + let Latency = 25; + let ResourceCycles = [1, 3]; + let NumMicroOps = 17; +} +def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; + +defm : PdWriteResXMMPair; +// WriteFRndX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +def PdWriteVFRCZ : SchedWriteRes<[PdFPU1, PdFPSTO]> { + let Latency = 10; + 
let NumMicroOps = 2; +} +def : InstRW<[PdWriteVFRCZ], (instrs VFRCZPDrr, VFRCZPSrr, + VFRCZSDrr, VFRCZSSrr)>; + +def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> { + let Latency = 10; + let ResourceCycles = [2, 1]; + let NumMicroOps = 4; +} +def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>; + +defm : PdWriteResXMMPair; +// WriteFLogicX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +// WriteFTestX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +// WriteFShuffleX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +// WriteFVarShuffleX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +// WriteFBlendX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +// WriteFVarBlendX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : X86WriteResPairUnsupported; + +def PdWriteVPERM2F128rr: SchedWriteRes<[PdFPU01, PdFPFMA]> { + let Latency = 4; + let NumMicroOps = 8; +} +def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>; + +def PdWriteVPERM2F128rm: SchedWriteRes<[PdFPU01, PdFPFMA]> { + let Latency = 8; // 4 + 4 + let NumMicroOps = 10; +} +def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>; + +//////////////////////////////////////////////////////////////////////////////// +// Conversions. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteResXMMPair; + +defm : PdWriteResXMMPair; +// WriteCvtPS2IX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; + +defm : PdWriteResXMMPair; +// WriteCvtPD2IX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +// FIXME: f+3 ST, LD+STC latency +defm : PdWriteResXMMPair; + +defm : PdWriteResXMMPair; +// WriteCvtI2PSX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; + +def WriteCVTSI642SDrr: SchedWriteRes<[PdFPU1, PdFPSTO]> { + let Latency = 13; + let NumMicroOps = 2; +} +def : InstRW<[WriteCVTSI642SDrr], (instrs CVTSI642SDrr, CVTSI642SSrr)>; + +defm : PdWriteResXMMPair; +// WriteCvtI2PDX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; + +defm : PdWriteResXMMPair; +// WriteCvtPS2PDX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; + +defm : PdWriteResXMMPair; +// WriteCvtPD2PSX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +// WriteCvtPH2PSX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; +// WriteCvtPS2PHX +defm : PdWriteRes; +defm : X86WriteResUnsupported; + +defm : PdWriteRes; +// WriteCvtPS2PHXSt +defm : PdWriteRes; +defm : X86WriteResUnsupported; + +//////////////////////////////////////////////////////////////////////////////// +// Vector integer operations. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; +// WriteVecLoadNTX +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; +// WriteVecMaskedLoadX +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +def PdWriteVMOVDQUYmr: SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> { + let NumMicroOps = 8; +} +def : InstRW<[PdWriteVMOVDQUYmr], (instrs VMOVDQUYmr)>; + +defm : PdWriteRes; +// WriteVecStoreNTX +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; +// WriteVecMaskedStoreX +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +// defm : X86WriteResPairUnsupported; + +defm : PdWriteRes; +defm : PdWriteRes; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +def JWriteVPMACS: SchedWriteRes<[PdFPU0, PdFPU01, PdFPMMA, PdFPMAL]> { + let Latency = 4; + let ResourceCycles = [2, 1, 2, 1]; +} +def : InstRW<[JWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr, + VPMACSSDQLrr)>; + +defm : PdWriteResXMMPair; +// WriteMPSADX +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : 
PdWriteResXMMPair; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +// WriteBlendX +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +// WriteVarBlendX +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +// WriteVecTestX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; + +defm : PdWriteResXMMPair; +// WriteVarVecShiftX +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +//////////////////////////////////////////////////////////////////////////////// +// Vector insert/extract operations. +//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteRes; +defm : PdWriteRes; + +defm : PdWriteRes; +defm : PdWriteRes; + +def PdWriteEXTRQ: SchedWriteRes<[PdFPU01, PdFPMAL]> { + let Latency = 3; +} +def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>; + +//////////////////////////////////////////////////////////////////////////////// +// SSE42 String instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; + +//////////////////////////////////////////////////////////////////////////////// +// MOVMSK Instructions. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteRes; + +defm : PdWriteRes; +// WriteVecMOVMSKX +defm : X86WriteResUnsupported; +// defm : X86WriteResUnsupported; + +defm : PdWriteRes; + +//////////////////////////////////////////////////////////////////////////////// +// AES Instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; + +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteResXMMPair; +// WriteFHAddX +defm : PdWriteResYMMPair; +defm : X86WriteResPairUnsupported; + +defm : PdWriteResXMMPair; +defm : PdWriteResXMMPair; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; + +def : InstRW<[WritePHAdd], (instrs PHADDDrr, PHSUBDrr, + PHADDWrr, PHSUBWrr, + PHADDSWrr, PHSUBSWrr, + VPHADDDrr, VPHSUBDrr, + VPHADDWrr, VPHSUBWrr, + VPHADDSWrr, VPHSUBSWrr)>; + +//////////////////////////////////////////////////////////////////////////////// +// Carry-less multiplication instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : PdWriteResXMMPair; + +def PdWriteVPCLMULQDQrr: SchedWriteRes<[PdFPU0, PdFPMMA]> { + let Latency = 12; + let NumMicroOps = 6; +} +def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>; + +//////////////////////////////////////////////////////////////////////////////// +// SSE4A instructions. 
+//////////////////////////////////////////////////////////////////////////////// + +def PdWriteINSERTQ: SchedWriteRes<[PdFPU01, PdFPMAL]> { + let Latency = 3; + let ResourceCycles = [1, 4]; +} +def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ, INSERTQI)>; + +//////////////////////////////////////////////////////////////////////////////// +// AVX instructions. +//////////////////////////////////////////////////////////////////////////////// + +def PdWriteVBROADCASTYLd: SchedWriteRes<[PdLoad, PdFPU01, PdFPFMA]> { + let Latency = 6; + let ResourceCycles = [1, 2, 4]; + let NumMicroOps = 2; +} +def : InstRW<[PdWriteVBROADCASTYLd, ReadAfterLd], (instrs VBROADCASTSDYrm, + VBROADCASTSSYrm)>; + +def PdWriteVZEROALL: SchedWriteRes<[]> { + let Latency = 90; + let NumMicroOps = 32; +} +def : InstRW<[PdWriteVZEROALL], (instrs VZEROALL)>; + +def PdWriteVZEROUPPER: SchedWriteRes<[]> { + let Latency = 46; + let NumMicroOps = 16; +} +def : InstRW<[PdWriteVZEROUPPER], (instrs VZEROUPPER)>; + +/////////////////////////////////////////////////////////////////////////////// +// SchedWriteVariant definitions. +/////////////////////////////////////////////////////////////////////////////// + +def PdWriteZeroLatency : SchedWriteRes<[]> { + let Latency = 0; +} + +def PdWriteZeroIdiom : SchedWriteVariant<[ + SchedVar, [PdWriteZeroLatency]>, + SchedVar, [WriteALU]> +]>; +def : InstRW<[PdWriteZeroIdiom], (instrs SUB32rr, SUB64rr, + XOR32rr, XOR64rr)>; + +def PdWriteFZeroIdiom : SchedWriteVariant<[ + SchedVar, [PdWriteZeroLatency]>, + SchedVar, [WriteFLogic]> +]>; +def : InstRW<[PdWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, + XORPDrr, VXORPDrr, + ANDNPSrr, VANDNPSrr, + ANDNPDrr, VANDNPDrr)>; + +// VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr "zero-idioms" have latency of 1. 
+
+// Vector zero-idiom write variants. Each pairs a zero-idiom predicate
+// (-> PdWriteZeroLatency) with an always-true fallback to the generic write.
+// NOTE(review): this copy of the patch had lost the first template argument of
+// every SchedVar ("SchedVar, [...]>" is not valid TableGen). The predicates
+// are restored as MCSchedPredicate<ZeroIdiomPredicate> and
+// MCSchedPredicate<TruePred>; confirm against the committed file.
+
+// MMX logic.
+def PdWriteVZeroIdiomLogic : SchedWriteVariant<[
+    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+    SchedVar<MCSchedPredicate<TruePred>,           [WriteVecLogic]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;
+
+// xmm integer logic.
+def PdWriteVZeroIdiomLogicX : SchedWriteVariant<[
+    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+    SchedVar<MCSchedPredicate<TruePred>,           [WriteVecLogicX]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
+                                                PANDNrr, VPANDNrr)>;
+
+// MMX integer subtract / compare-greater-than.
+def PdWriteVZeroIdiomALU : SchedWriteVariant<[
+    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+    SchedVar<MCSchedPredicate<TruePred>,           [WriteVecALU]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr,
+                                             MMX_PSUBQirr, MMX_PSUBWirr,
+                                             MMX_PCMPGTBirr,
+                                             MMX_PCMPGTDirr,
+                                             MMX_PCMPGTWirr)>;
+
+// xmm integer subtract / compare-greater-than.
+def PdWriteVZeroIdiomALUX : SchedWriteVariant<[
+    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+    SchedVar<MCSchedPredicate<TruePred>,           [WriteVecALUX]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+                                              PSUBDrr, VPSUBDrr,
+                                              PSUBQrr, VPSUBQrr,
+                                              PSUBWrr, VPSUBWrr,
+                                              PCMPGTBrr, VPCMPGTBrr,
+                                              PCMPGTDrr, VPCMPGTDrr,
+                                              PCMPGTWrr, VPCMPGTWrr)>;
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+// VPCMPGTQ, but not PCMPGTQ!
+
+// Opcodes reported as zero idioms for this model, grouped by register class.
+// Every group is guarded by ZeroIdiomPredicate, which is defined outside this
+// file (presumably "both source operands are the same register" - verify).
+def : IsZeroIdiomFunction<[
+  // GPR Zero-idioms.
+  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
+
+  // MMX Zero-idioms.
+  DepBreakingClass<[
+    MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
+    MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
+    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
+    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
+  ], ZeroIdiomPredicate>,
+
+  // SSE Zero-idioms.
+  DepBreakingClass<[
+    // fp variants.
+    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
+
+    // int variants.
+    PXORrr, PANDNrr,
+    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
+    PCMPGTBrr, PCMPGTDrr, PCMPGTWrr
+  ], ZeroIdiomPredicate>,
+
+  // AVX Zero-idioms.
+  DepBreakingClass<[
+    // xmm fp variants.
+    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
+
+    // xmm int variants.
+    VPXORrr, VPANDNrr,
+    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
+    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+
+    // ymm variants.
+    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
+  ], ZeroIdiomPredicate>
+]>;
+
+// Opcodes reported as dependency-breaking (but not listed as zero idioms
+// above). CMP uses CheckSameRegOperand<0, 1> rather than ZeroIdiomPredicate.
+def : IsDepBreakingFunction<[
+  // GPR
+  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
+  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
+
+  // MMX
+  DepBreakingClass<[
+    MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
+  ], ZeroIdiomPredicate>,
+
+  // SSE
+  DepBreakingClass<[
+    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr
+    // But not PCMPEQQrr.
+  ], ZeroIdiomPredicate>,
+
+  // AVX
+  DepBreakingClass<[
+    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr
+    // But not VPCMPEQQrr.
+  ], ZeroIdiomPredicate>
+]>;
+
+
+} // SchedModel
Index: test/CodeGen/X86/fma.ll =================================================================== --- test/CodeGen/X86/fma.ll +++ test/CodeGen/X86/fma.ll @@ -247,76 +247,6 @@ ; FMA32-NEXT: ## xmm0 = (xmm1 * xmm0) + xmm2 ; FMA32-NEXT: retl ## encoding: [0xc3] ; -; FMACALL32-LABEL: test_v4f32: -; FMACALL32: ## %bb.0: ## %entry -; FMACALL32-NEXT: subl $108, %esp ## encoding: [0x83,0xec,0x6c] -; FMACALL32-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x40] -; FMACALL32-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x30] -; FMACALL32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20] -; FMACALL32-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x08,0x02] -; FMACALL32-NEXT: vextractps $2, %xmm1, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x4c,0x24,0x04,0x02] -; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp)
## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] -; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x54] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] -; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: 
[0xc5,0xfa,0x11,0x04,0x24] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] -; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x54] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] -; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c] -; FMACALL32-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10] -; FMACALL32-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## 
encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20] -; FMACALL32-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30] -; FMACALL32-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] -; FMACALL32-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] -; FMACALL32-NEXT: retl ## encoding: [0xc3] -; ; FMA64-LABEL: test_v4f32: ; FMA64: ## %bb.0: ## %entry ; FMA64-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2] @@ -407,6 +337,76 @@ ; AVX512VL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2] ; AVX512VL-NEXT: ## xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512VL-NEXT: retq ## encoding: [0xc3] +; +; FMACALL32_BDVER2-LABEL: test_v4f32: +; FMACALL32_BDVER2: ## %bb.0: ## %entry +; FMACALL32_BDVER2-NEXT: subl $108, %esp ## encoding: [0x83,0xec,0x6c] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x08,0x02] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x4c,0x24,0x04,0x02] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xdb,0x7c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x54] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, 
value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x54] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] +; FMACALL32_BDVER2-NEXT: vinsertps 
$32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] +; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] entry: %call = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) ret <4 x float> %call @@ -419,165 +419,6 @@ ; FMA32-NEXT: ## ymm0 = (ymm1 * ymm0) + ymm2 ; FMA32-NEXT: retl ## encoding: [0xc3] ; -; FMACALL32-LABEL: test_v8f32: -; FMACALL32: ## %bb.0: ## %entry -; FMACALL32-NEXT: subl $316, %esp ## encoding: [0x81,0xec,0x3c,0x01,0x00,0x00] -; FMACALL32-NEXT: ## imm = 0x13C -; FMACALL32-NEXT: vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0x00,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01] -; FMACALL32-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x60] -; FMACALL32-NEXT: vextractps $2, %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x5c,0x24,0x08,0x02] -; FMACALL32-NEXT: vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01] -; FMACALL32-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50] -; FMACALL32-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: 
[0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] -; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] -; FMACALL32-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40] -; FMACALL32-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xb4,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa8,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32-NEXT: vmovaps 
{{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x9c,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x90,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] -; 
FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x84,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt 
{{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x78] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x78] -; 
FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x84,0x00,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x90,0x00,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x9c,0x00,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xa8,0x00,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xb4,0x00,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] -; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c] -; FMACALL32-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10] -; FMACALL32-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20] -; FMACALL32-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30] 
-; FMACALL32-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] -; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] -; FMACALL32-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] -; FMACALL32-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] -; FMACALL32-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] -; FMACALL32-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] -; FMACALL32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32-NEXT: addl $316, %esp ## encoding: [0x81,0xc4,0x3c,0x01,0x00,0x00] -; FMACALL32-NEXT: ## imm = 0x13C -; FMACALL32-NEXT: retl ## encoding: [0xc3] -; ; FMA64-LABEL: test_v8f32: ; FMA64: ## %bb.0: ## %entry ; FMA64-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2] @@ -745,6 +586,165 @@ ; AVX512VL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2] ; AVX512VL-NEXT: ## ymm0 = (ymm1 * ymm0) + ymm2 ; AVX512VL-NEXT: retq ## encoding: [0xc3] +; +; FMACALL32_BDVER2-LABEL: test_v8f32: +; FMACALL32_BDVER2: ## %bb.0: ## %entry +; FMACALL32_BDVER2-NEXT: subl $316, %esp ## encoding: [0x81,0xec,0x3c,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## imm = 0x13C +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01] +; FMACALL32_BDVER2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01] +; FMACALL32_BDVER2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte 
Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] +; FMACALL32_BDVER2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x5c,0x24,0x08,0x02] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xb4,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xc5,0xf8,0x28,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa8,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x9c,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: 
[0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x90,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## 
fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x84,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: 
vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x84,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x90,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps 
{{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x9c,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa8,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xb4,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: 
[0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; FMACALL32_BDVER2-NEXT: addl $316, %esp ## encoding: [0x81,0xc4,0x3c,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## imm = 0x13C +; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] entry: %call = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) ret <8 x float> %call @@ -765,321 +765,6 @@ ; FMA32-NEXT: popl %ebp ## encoding: [0x5d] ; FMA32-NEXT: retl ## encoding: [0xc3] ; -; FMACALL32-LABEL: test_v16f32: -; FMACALL32: ## %bb.0: ## %entry -; FMACALL32-NEXT: pushl %ebp ## encoding: [0x55] -; FMACALL32-NEXT: movl %esp, %ebp ## encoding: [0x89,0xe5] -; FMACALL32-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0] -; FMACALL32-NEXT: subl $448, %esp ## encoding: [0x81,0xec,0xc0,0x01,0x00,0x00] -; FMACALL32-NEXT: ## imm = 0x1C0 -; FMACALL32-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60] -; FMACALL32-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: 
[0xc5,0xfc,0x28,0x45,0x28] -; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] -; FMACALL32-NEXT: vextractf128 $1, %ymm3, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd8,0x01] -; FMACALL32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xb0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] -; FMACALL32-NEXT: vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01] -; FMACALL32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] -; FMACALL32-NEXT: 
vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] -; FMACALL32-NEXT: 
vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] -; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] -; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] -; FMACALL32-NEXT: vextractps $1, %xmm0, 
{{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] -; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00] -; 
FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50] -; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x00,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] -; FMACALL32-NEXT: vmovaps 
{{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] -; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] -; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: 
[0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] -; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] -; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] -; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: 
[0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x50] -; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: 
[0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x50] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] -; 
FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00] -; FMACALL32-NEXT: 
fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00] -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] -; FMACALL32-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] -; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c] -; FMACALL32-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10] -; FMACALL32-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20] -; FMACALL32-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30] -; FMACALL32-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] -; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c] -; FMACALL32-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10] -; FMACALL32-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20] -; FMACALL32-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30] -; 
FMACALL32-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] -; FMACALL32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x4c] -; FMACALL32-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x10] -; FMACALL32-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x20] -; FMACALL32-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x40,0x30] -; FMACALL32-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] -; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x3c] -; FMACALL32-NEXT: ## xmm2 = mem[0],zero,zero,zero -; FMACALL32-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x38,0x10] -; FMACALL32-NEXT: ## xmm2 = xmm2[0],mem[0],xmm2[2,3] -; FMACALL32-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x20] -; FMACALL32-NEXT: ## xmm2 = xmm2[0,1],mem[0],xmm2[3] -; FMACALL32-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x30] -; FMACALL32-NEXT: ## xmm2 = xmm2[0,1,2],mem[0] -; FMACALL32-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] -; FMACALL32-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] -; FMACALL32-NEXT: popl %ebp ## encoding: [0x5d] -; FMACALL32-NEXT: retl ## encoding: [0xc3] -; ; FMA64-LABEL: test_v16f32: ; FMA64: ## %bb.0: ## %entry ; FMA64-NEXT: vfmadd213ps %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0x6d,0xa8,0xc4] @@ -1378,6 +1063,321 @@ ; AVX512VL-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2] ; 
AVX512VL-NEXT: ## zmm0 = (zmm1 * zmm0) + zmm2 ; AVX512VL-NEXT: retq ## encoding: [0xc3] +; +; FMACALL32_BDVER2-LABEL: test_v16f32: +; FMACALL32_BDVER2: ## %bb.0: ## %entry +; FMACALL32_BDVER2-NEXT: pushl %ebp ## encoding: [0x55] +; FMACALL32_BDVER2-NEXT: movl %esp, %ebp ## encoding: [0x89,0xe5] +; FMACALL32_BDVER2-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0] +; FMACALL32_BDVER2-NEXT: subl $448, %esp ## encoding: [0x81,0xec,0xc0,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## imm = 0x1C0 +; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: 
[0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps 
{{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## 
fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] 
+; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: 
[0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: 
vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: 
[0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; 
FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: 
[0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xb0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## 
encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## 
encoding: [0xd9,0x5c,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] +; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: 
## xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x38,0x10] +; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0],xmm2[2,3] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0],xmm2[3] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x30] +; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; FMACALL32_BDVER2-NEXT: 
vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x10] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x40,0x30] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] +; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] +; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] +; FMACALL32_BDVER2-NEXT: popl %ebp ## encoding: [0x5d] +; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] entry: %call = call <16 x float> @llvm.fma.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c) ret <16 x float> %call @@ -1390,41 +1390,6 @@ ; FMA32-NEXT: ## xmm0 = (xmm1 * xmm0) + xmm2 ; FMA32-NEXT: retl ## encoding: [0xc3] ; -; FMACALL32-LABEL: test_v2f64: -; FMACALL32: ## %bb.0: ## %entry -; FMACALL32-NEXT: subl $108, %esp ## encoding: [0x83,0xec,0x6c] -; FMACALL32-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x30] -; FMACALL32-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40] -; FMACALL32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50] -; FMACALL32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x54,0x24,0x10] -; FMACALL32-NEXT: vmovlhps %xmm1, %xmm0, %xmm2 ## encoding: [0xc5,0xf8,0x16,0xd1] -; FMACALL32-NEXT: ## xmm2 = xmm0[0],xmm1[0] -; FMACALL32-NEXT: 
vmovups %xmm2, (%esp) ## encoding: [0xc5,0xf8,0x11,0x14,0x24] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x30] -; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58] -; FMACALL32-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] -; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28] -; FMACALL32-NEXT: ## xmm0 = mem[0],zero -; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x20] -; FMACALL32-NEXT: ## xmm0 = xmm0[0],mem[0] -; FMACALL32-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] -; FMACALL32-NEXT: retl ## encoding: [0xc3] -; ; FMA64-LABEL: test_v2f64: ; FMA64: ## %bb.0: ## %entry ; FMA64-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2] @@ -1477,6 +1442,41 @@ ; AVX512VL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2] ; AVX512VL-NEXT: ## xmm0 = (xmm1 * xmm0) + xmm2 ; AVX512VL-NEXT: retq ## encoding: [0xc3] +; +; FMACALL32_BDVER2-LABEL: test_v2f64: +; FMACALL32_BDVER2: ## %bb.0: ## %entry +; 
FMACALL32_BDVER2-NEXT: subl $108, %esp ## encoding: [0x83,0xec,0x6c] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc1] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm1[0] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x54,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] +; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpl 
{{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] +; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] +; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] entry: %call = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) ret <2 x double> %call @@ -1489,90 +1489,6 @@ ; FMA32-NEXT: ## ymm0 = (ymm1 * ymm0) + ymm2 ; FMA32-NEXT: retl ## encoding: [0xc3] ; -; FMACALL32-LABEL: test_v4f64: -; FMACALL32: ## %bb.0: ## %entry -; FMACALL32-NEXT: subl $252, %esp ## encoding: [0x81,0xec,0xfc,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01] -; FMACALL32-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x70] -; FMACALL32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x5c,0x24,0x10] -; FMACALL32-NEXT: vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01] -; FMACALL32-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50] -; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] -; 
FMACALL32-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x60] -; FMACALL32-NEXT: vmovlhps %xmm2, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc2] -; FMACALL32-NEXT: ## xmm0 = xmm1[0],xmm2[0] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x44] -; FMACALL32-NEXT: vmovupd {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfd,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: ## xmm0 = xmm0[1],mem[1] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x38] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] -; FMACALL32-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), 
%ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: ## xmm0 = xmm0[0],mem[0] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x70] -; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] -; FMACALL32-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68] -; FMACALL32-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x38] -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x44] -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18] -; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), 
%xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30] -; FMACALL32-NEXT: ## xmm0 = mem[0],zero -; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x28] -; FMACALL32-NEXT: ## xmm0 = xmm0[0],mem[0] -; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20] -; FMACALL32-NEXT: ## xmm1 = mem[0],zero -; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x18] -; FMACALL32-NEXT: ## xmm1 = xmm1[0],mem[0] -; FMACALL32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32-NEXT: addl $252, %esp ## encoding: [0x81,0xc4,0xfc,0x00,0x00,0x00] -; FMACALL32-NEXT: retl ## encoding: [0xc3] -; ; FMA64-LABEL: test_v4f64: ; FMA64: ## %bb.0: ## %entry ; FMA64-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2] @@ -1664,6 +1580,90 @@ ; AVX512VL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2] ; AVX512VL-NEXT: ## ymm0 = (ymm1 * ymm0) + ymm2 ; AVX512VL-NEXT: retq ## encoding: [0xc3] +; +; FMACALL32_BDVER2-LABEL: test_v4f64: +; FMACALL32_BDVER2: ## %bb.0: ## %entry +; FMACALL32_BDVER2-NEXT: subl $252, %esp ## encoding: [0x81,0xec,0xfc,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01] +; FMACALL32_BDVER2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32_BDVER2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 
32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm2[0] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x70] +; FMACALL32_BDVER2-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x5c,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x44] +; FMACALL32_BDVER2-NEXT: vmovupd {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfd,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],mem[1] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: 
[0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x38] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x70] +; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68] +; FMACALL32_BDVER2-NEXT: ## xmm0 = 
mem[0,1],xmm0[2,3] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x38] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x44] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x28] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0] +; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; FMACALL32_BDVER2-NEXT: addl $252, %esp ## encoding: [0x81,0xc4,0xfc,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] entry: %call = call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) ret <4 x double> %call @@ -1684,179 +1684,6 @@ ; FMA32-NEXT: popl %ebp ## encoding: [0x5d] ; FMA32-NEXT: retl ## encoding: [0xc3] ; -; FMACALL32-LABEL: 
test_v8f64: -; FMACALL32: ## %bb.0: ## %entry -; FMACALL32-NEXT: pushl %ebp ## encoding: [0x55] -; FMACALL32-NEXT: movl %esp, %ebp ## encoding: [0x89,0xe5] -; FMACALL32-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0] -; FMACALL32-NEXT: subl $384, %esp ## encoding: [0x81,0xec,0x80,0x01,0x00,0x00] -; FMACALL32-NEXT: ## imm = 0x180 -; FMACALL32-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] -; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x60,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] -; FMACALL32-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] -; FMACALL32-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x40,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01] -; FMACALL32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x50,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2] -; FMACALL32-NEXT: ## xmm0 = 
xmm0[0],xmm2[0] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovapd 40(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x28] -; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: ## xmm0 = xmm0[1],mem[1] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] -; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: ## xmm0 = xmm0[0],mem[0] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## 
encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] -; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x29,0x44,0x24,0x30] -; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00] -; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] -; FMACALL32-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0x30,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20] -; FMACALL32-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1] -; FMACALL32-NEXT: ## xmm0 = xmm0[1],xmm1[1] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; 
FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] -; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] -; FMACALL32-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x30,0x01,0x00,0x00] -; FMACALL32-NEXT: ## xmm0 = xmm0[0],mem[0] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30] -; FMACALL32-NEXT: vmovapd 8(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x08] -; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0x00,0x01,0x00,0x00] -; FMACALL32-NEXT: ## xmm0 = xmm0[1],mem[1] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill -; FMACALL32-NEXT: ## 
encoding: [0xdb,0x7c,0x24,0x20] -; FMACALL32-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] -; FMACALL32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x00,0x01,0x00,0x00] -; FMACALL32-NEXT: ## xmm0 = xmm0[0],mem[0] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x60,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] -; FMACALL32-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x40,0x01,0x00,0x00] -; FMACALL32-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x58,0x01,0x00,0x00] -; FMACALL32-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] -; FMACALL32-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x20] -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0x6c,0x24,0x30] -; FMACALL32-NEXT: fstpl 
{{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x88,0x00,0x00,0x00] -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78] -; FMACALL32-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload -; FMACALL32-NEXT: ## encoding: [0xdb,0xac,0x24,0x94,0x00,0x00,0x00] -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x70] -; FMACALL32-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] -; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68] -; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60] -; FMACALL32-NEXT: ## xmm0 = mem[0],zero -; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x58] -; FMACALL32-NEXT: ## xmm0 = xmm0[0],mem[0] -; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50] -; FMACALL32-NEXT: ## xmm1 = mem[0],zero -; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x48] -; FMACALL32-NEXT: ## xmm1 = xmm1[0],mem[0] -; FMACALL32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] -; FMACALL32-NEXT: ## xmm1 = mem[0],zero -; FMACALL32-NEXT: 
vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x78] -; FMACALL32-NEXT: ## xmm1 = xmm1[0],mem[0] -; FMACALL32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70] -; FMACALL32-NEXT: ## xmm2 = mem[0],zero -; FMACALL32-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0x16,0x54,0x24,0x68] -; FMACALL32-NEXT: ## xmm2 = xmm2[0],mem[0] -; FMACALL32-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] -; FMACALL32-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] -; FMACALL32-NEXT: popl %ebp ## encoding: [0x5d] -; FMACALL32-NEXT: retl ## encoding: [0xc3] -; ; FMA64-LABEL: test_v8f64: ; FMA64: ## %bb.0: ## %entry ; FMA64-NEXT: vfmadd213pd %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0xed,0xa8,0xc4] @@ -2011,6 +1838,179 @@ ; AVX512VL-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2] ; AVX512VL-NEXT: ## zmm0 = (zmm1 * zmm0) + zmm2 ; AVX512VL-NEXT: retq ## encoding: [0xc3] +; +; FMACALL32_BDVER2-LABEL: test_v8f64: +; FMACALL32_BDVER2: ## %bb.0: ## %entry +; FMACALL32_BDVER2-NEXT: pushl %ebp ## encoding: [0x55] +; FMACALL32_BDVER2-NEXT: movl %esp, %ebp ## encoding: [0x89,0xe5] +; FMACALL32_BDVER2-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0] +; FMACALL32_BDVER2-NEXT: subl $384, %esp ## encoding: [0x81,0xec,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## imm = 0x180 +; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x40,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x50,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm2[0] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovapd 40(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; 
FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],mem[1] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32_BDVER2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x29,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0x30,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],xmm1[1] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: 
[0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x30,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovapd 8(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],mem[1] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## 
encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x00,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x40,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x58,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: 
fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x88,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x94,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x70] +; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] +; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70] +; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x58] +; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x48] +; 
FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0] +; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0x16,0x54,0x24,0x68] +; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0] +; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0] +; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] +; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] +; FMACALL32_BDVER2-NEXT: popl %ebp ## encoding: [0x5d] +; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] entry: %call = call <8 x double> @llvm.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c) ret <8 x double> %call Index: test/CodeGen/X86/lwp-intrinsics.ll =================================================================== --- test/CodeGen/X86/lwp-intrinsics.ll +++ test/CodeGen/X86/lwp-intrinsics.ll @@ -40,14 +40,41 @@ } define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind { -; X86-LABEL: test_lwpins32_rri: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF -; X86-NEXT: setb %al -; X86-NEXT: retl +; X86_BDVER1-LABEL: test_lwpins32_rri: +; X86_BDVER1: # %bb.0: +; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_BDVER1-NEXT: addl %ecx, %ecx +; X86_BDVER1-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF +; X86_BDVER1-NEXT: setb %al +; X86_BDVER1-NEXT: retl +; +; X86_BDVER2-LABEL: test_lwpins32_rri: +; X86_BDVER2: # %bb.0: +; X86_BDVER2-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86_BDVER2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_BDVER2-NEXT: addl %ecx, %ecx +; X86_BDVER2-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF +; X86_BDVER2-NEXT: setb %al +; X86_BDVER2-NEXT: retl +; +; X86_BDVER3-LABEL: test_lwpins32_rri: +; X86_BDVER3: # %bb.0: +; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86_BDVER3-NEXT: addl %ecx, %ecx +; X86_BDVER3-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF +; X86_BDVER3-NEXT: setb %al +; X86_BDVER3-NEXT: retl +; +; X86_BDVER4-LABEL: test_lwpins32_rri: +; X86_BDVER4: # %bb.0: +; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86_BDVER4-NEXT: addl %ecx, %ecx +; X86_BDVER4-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF +; X86_BDVER4-NEXT: setb %al +; X86_BDVER4-NEXT: retl ; ; X64-LABEL: test_lwpins32_rri: ; X64: # %bb.0: @@ -80,13 +107,37 @@ } define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind { -; X86-LABEL: test_lwpval32_rri: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98 -; X86-NEXT: retl +; X86_BDVER1-LABEL: test_lwpval32_rri: +; X86_BDVER1: # %bb.0: +; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_BDVER1-NEXT: addl %ecx, %ecx +; X86_BDVER1-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98 +; X86_BDVER1-NEXT: retl +; +; X86_BDVER2-LABEL: test_lwpval32_rri: +; X86_BDVER2: # %bb.0: +; X86_BDVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86_BDVER2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_BDVER2-NEXT: addl %ecx, %ecx +; X86_BDVER2-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98 +; X86_BDVER2-NEXT: retl +; +; X86_BDVER3-LABEL: test_lwpval32_rri: +; X86_BDVER3: # %bb.0: +; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %ecx 
+; X86_BDVER3-NEXT: addl %ecx, %ecx +; X86_BDVER3-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98 +; X86_BDVER3-NEXT: retl +; +; X86_BDVER4-LABEL: test_lwpval32_rri: +; X86_BDVER4: # %bb.0: +; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86_BDVER4-NEXT: addl %ecx, %ecx +; X86_BDVER4-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98 +; X86_BDVER4-NEXT: retl ; ; X64-LABEL: test_lwpval32_rri: ; X64: # %bb.0: Index: test/CodeGen/X86/lwp-schedule.ll =================================================================== --- test/CodeGen/X86/lwp-schedule.ll +++ test/CodeGen/X86/lwp-schedule.ll @@ -11,10 +11,25 @@ ; GENERIC-NEXT: llwpcb %rdi # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_llwpcb: -; BDVER: # %bb.0: -; BDVER-NEXT: llwpcb %rdi -; BDVER-NEXT: retq +; BDVER1-LABEL: test_llwpcb: +; BDVER1: # %bb.0: +; BDVER1-NEXT: llwpcb %rdi # sched: [100:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] +; +; BDVER2-LABEL: test_llwpcb: +; BDVER2: # %bb.0: +; BDVER2-NEXT: llwpcb %rdi # sched: [100:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_llwpcb: +; BDVER3: # %bb.0: +; BDVER3-NEXT: llwpcb %rdi +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_llwpcb: +; BDVER4: # %bb.0: +; BDVER4-NEXT: llwpcb %rdi +; BDVER4-NEXT: retq tail call void @llvm.x86.llwpcb(i8 *%a0) ret void } @@ -25,10 +40,25 @@ ; GENERIC-NEXT: slwpcb %rax # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_slwpcb: -; BDVER: # %bb.0: -; BDVER-NEXT: slwpcb %rax -; BDVER-NEXT: retq +; BDVER1-LABEL: test_slwpcb: +; BDVER1: # %bb.0: +; BDVER1-NEXT: slwpcb %rax # sched: [100:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] +; +; BDVER2-LABEL: test_slwpcb: +; BDVER2: # %bb.0: +; BDVER2-NEXT: slwpcb %rax # sched: [100:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_slwpcb: +; BDVER3: # %bb.0: +; BDVER3-NEXT: slwpcb %rax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: 
test_slwpcb: +; BDVER4: # %bb.0: +; BDVER4-NEXT: slwpcb %rax +; BDVER4-NEXT: retq %1 = tail call i8* @llvm.x86.slwpcb() ret i8 *%1 } @@ -42,12 +72,35 @@ ; GENERIC-NEXT: setb %al # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_lwpins32_rri: -; BDVER: # %bb.0: -; BDVER-NEXT: addl %esi, %esi -; BDVER-NEXT: lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF -; BDVER-NEXT: setb %al -; BDVER-NEXT: retq +; BDVER1-LABEL: test_lwpins32_rri: +; BDVER1: # %bb.0: +; BDVER1-NEXT: addl %esi, %esi # sched: [1:0.50] +; BDVER1-NEXT: lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF +; BDVER1-NEXT: # sched: [100:0.50] +; BDVER1-NEXT: setb %al # sched: [1:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] +; +; BDVER2-LABEL: test_lwpins32_rri: +; BDVER2: # %bb.0: +; BDVER2-NEXT: addl %esi, %esi # sched: [1:0.50] +; BDVER2-NEXT: lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF +; BDVER2-NEXT: # sched: [100:0.50] +; BDVER2-NEXT: setb %al # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_lwpins32_rri: +; BDVER3: # %bb.0: +; BDVER3-NEXT: addl %esi, %esi +; BDVER3-NEXT: lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF +; BDVER3-NEXT: setb %al +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_lwpins32_rri: +; BDVER4: # %bb.0: +; BDVER4-NEXT: addl %esi, %esi +; BDVER4-NEXT: lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF +; BDVER4-NEXT: setb %al +; BDVER4-NEXT: retq %1 = add i32 %a1, %a1 %2 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %1, i32 2309737967) ret i8 %2 @@ -61,11 +114,31 @@ ; GENERIC-NEXT: setb %al # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_lwpins32_rmi: -; BDVER: # %bb.0: -; BDVER-NEXT: lwpins $1985229328, (%rsi), %edi # imm = 0x76543210 -; BDVER-NEXT: setb %al -; BDVER-NEXT: retq +; BDVER1-LABEL: test_lwpins32_rmi: +; BDVER1: # %bb.0: +; BDVER1-NEXT: lwpins $1985229328, (%rsi), %edi # imm = 0x76543210 +; BDVER1-NEXT: # sched: [100:0.50] +; BDVER1-NEXT: setb %al # sched: [1:0.50] 
+; BDVER1-NEXT: retq # sched: [5:1.00] +; +; BDVER2-LABEL: test_lwpins32_rmi: +; BDVER2: # %bb.0: +; BDVER2-NEXT: lwpins $1985229328, (%rsi), %edi # imm = 0x76543210 +; BDVER2-NEXT: # sched: [100:0.50] +; BDVER2-NEXT: setb %al # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_lwpins32_rmi: +; BDVER3: # %bb.0: +; BDVER3-NEXT: lwpins $1985229328, (%rsi), %edi # imm = 0x76543210 +; BDVER3-NEXT: setb %al +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_lwpins32_rmi: +; BDVER4: # %bb.0: +; BDVER4-NEXT: lwpins $1985229328, (%rsi), %edi # imm = 0x76543210 +; BDVER4-NEXT: setb %al +; BDVER4-NEXT: retq %a1 = load i32, i32 *%p1 %1 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %a1, i32 1985229328) ret i8 %1 @@ -79,11 +152,31 @@ ; GENERIC-NEXT: setb %al # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_lwpins64_rri: -; BDVER: # %bb.0: -; BDVER-NEXT: lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF -; BDVER-NEXT: setb %al -; BDVER-NEXT: retq +; BDVER1-LABEL: test_lwpins64_rri: +; BDVER1: # %bb.0: +; BDVER1-NEXT: lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF +; BDVER1-NEXT: # sched: [100:0.50] +; BDVER1-NEXT: setb %al # sched: [1:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] +; +; BDVER2-LABEL: test_lwpins64_rri: +; BDVER2: # %bb.0: +; BDVER2-NEXT: lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF +; BDVER2-NEXT: # sched: [100:0.50] +; BDVER2-NEXT: setb %al # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_lwpins64_rri: +; BDVER3: # %bb.0: +; BDVER3-NEXT: lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF +; BDVER3-NEXT: setb %al +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_lwpins64_rri: +; BDVER4: # %bb.0: +; BDVER4-NEXT: lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF +; BDVER4-NEXT: setb %al +; BDVER4-NEXT: retq %1 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 2309737967) ret i8 %1 } @@ -96,11 +189,31 @@ ; GENERIC-NEXT: setb %al # sched: [1:0.50] ; GENERIC-NEXT: 
retq # sched: [1:1.00] ; -; BDVER-LABEL: test_lwpins64_rmi: -; BDVER: # %bb.0: -; BDVER-NEXT: lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210 -; BDVER-NEXT: setb %al -; BDVER-NEXT: retq +; BDVER1-LABEL: test_lwpins64_rmi: +; BDVER1: # %bb.0: +; BDVER1-NEXT: lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210 +; BDVER1-NEXT: # sched: [100:0.50] +; BDVER1-NEXT: setb %al # sched: [1:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] +; +; BDVER2-LABEL: test_lwpins64_rmi: +; BDVER2: # %bb.0: +; BDVER2-NEXT: lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210 +; BDVER2-NEXT: # sched: [100:0.50] +; BDVER2-NEXT: setb %al # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_lwpins64_rmi: +; BDVER3: # %bb.0: +; BDVER3-NEXT: lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210 +; BDVER3-NEXT: setb %al +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_lwpins64_rmi: +; BDVER4: # %bb.0: +; BDVER4-NEXT: lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210 +; BDVER4-NEXT: setb %al +; BDVER4-NEXT: retq %a1 = load i32, i32 *%p1 %1 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 1985229328) ret i8 %1 @@ -114,11 +227,31 @@ ; GENERIC-NEXT: # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_lwpval32_rri: -; BDVER: # %bb.0: -; BDVER-NEXT: addl %esi, %esi -; BDVER-NEXT: lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98 -; BDVER-NEXT: retq +; BDVER1-LABEL: test_lwpval32_rri: +; BDVER1: # %bb.0: +; BDVER1-NEXT: addl %esi, %esi # sched: [1:0.50] +; BDVER1-NEXT: lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98 +; BDVER1-NEXT: # sched: [100:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] +; +; BDVER2-LABEL: test_lwpval32_rri: +; BDVER2: # %bb.0: +; BDVER2-NEXT: addl %esi, %esi # sched: [1:0.50] +; BDVER2-NEXT: lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98 +; BDVER2-NEXT: # sched: [100:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_lwpval32_rri: +; BDVER3: # %bb.0: +; BDVER3-NEXT: addl %esi, %esi +; BDVER3-NEXT: 
lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98 +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_lwpval32_rri: +; BDVER4: # %bb.0: +; BDVER4-NEXT: addl %esi, %esi +; BDVER4-NEXT: lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98 +; BDVER4-NEXT: retq %1 = add i32 %a1, %a1 tail call void @llvm.x86.lwpval32(i32 %a0, i32 %1, i32 4275878552) ret void @@ -131,10 +264,27 @@ ; GENERIC-NEXT: # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_lwpval32_rmi: -; BDVER: # %bb.0: -; BDVER-NEXT: lwpval $305419896, (%rsi), %edi # imm = 0x12345678 -; BDVER-NEXT: retq +; BDVER1-LABEL: test_lwpval32_rmi: +; BDVER1: # %bb.0: +; BDVER1-NEXT: lwpval $305419896, (%rsi), %edi # imm = 0x12345678 +; BDVER1-NEXT: # sched: [100:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] +; +; BDVER2-LABEL: test_lwpval32_rmi: +; BDVER2: # %bb.0: +; BDVER2-NEXT: lwpval $305419896, (%rsi), %edi # imm = 0x12345678 +; BDVER2-NEXT: # sched: [100:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_lwpval32_rmi: +; BDVER3: # %bb.0: +; BDVER3-NEXT: lwpval $305419896, (%rsi), %edi # imm = 0x12345678 +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_lwpval32_rmi: +; BDVER4: # %bb.0: +; BDVER4-NEXT: lwpval $305419896, (%rsi), %edi # imm = 0x12345678 +; BDVER4-NEXT: retq %a1 = load i32, i32 *%p1 tail call void @llvm.x86.lwpval32(i32 %a0, i32 %a1, i32 305419896) ret void @@ -147,10 +297,27 @@ ; GENERIC-NEXT: # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_lwpval64_rri: -; BDVER: # %bb.0: -; BDVER-NEXT: lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98 -; BDVER-NEXT: retq +; BDVER1-LABEL: test_lwpval64_rri: +; BDVER1: # %bb.0: +; BDVER1-NEXT: lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98 +; BDVER1-NEXT: # sched: [100:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] +; +; BDVER2-LABEL: test_lwpval64_rri: +; BDVER2: # %bb.0: +; BDVER2-NEXT: lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98 +; BDVER2-NEXT: # sched: [100:0.50] +; BDVER2-NEXT: retq # 
sched: [5:1.00] +; +; BDVER3-LABEL: test_lwpval64_rri: +; BDVER3: # %bb.0: +; BDVER3-NEXT: lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98 +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_lwpval64_rri: +; BDVER4: # %bb.0: +; BDVER4-NEXT: lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98 +; BDVER4-NEXT: retq tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 4275878552) ret void } @@ -162,10 +329,27 @@ ; GENERIC-NEXT: # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_lwpval64_rmi: -; BDVER: # %bb.0: -; BDVER-NEXT: lwpval $305419896, (%rsi), %rdi # imm = 0x12345678 -; BDVER-NEXT: retq +; BDVER1-LABEL: test_lwpval64_rmi: +; BDVER1: # %bb.0: +; BDVER1-NEXT: lwpval $305419896, (%rsi), %rdi # imm = 0x12345678 +; BDVER1-NEXT: # sched: [100:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] +; +; BDVER2-LABEL: test_lwpval64_rmi: +; BDVER2: # %bb.0: +; BDVER2-NEXT: lwpval $305419896, (%rsi), %rdi # imm = 0x12345678 +; BDVER2-NEXT: # sched: [100:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_lwpval64_rmi: +; BDVER3: # %bb.0: +; BDVER3-NEXT: lwpval $305419896, (%rsi), %rdi # imm = 0x12345678 +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_lwpval64_rmi: +; BDVER4: # %bb.0: +; BDVER4-NEXT: lwpval $305419896, (%rsi), %rdi # imm = 0x12345678 +; BDVER4-NEXT: retq %a1 = load i32, i32 *%p1 tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 305419896) ret void Index: test/CodeGen/X86/memset.ll =================================================================== --- test/CodeGen/X86/memset.ll +++ test/CodeGen/X86/memset.ll @@ -22,7 +22,6 @@ ; X86-NEXT: calll _foo ; X86-NEXT: addl $44, %esp ; X86-NEXT: retl -; X86-NEXT: ## -- End function ; ; XMM-LABEL: t: ; XMM: ## %bb.0: ## %entry @@ -35,7 +34,6 @@ ; XMM-NEXT: calll _foo ; XMM-NEXT: addl $60, %esp ; XMM-NEXT: retl -; XMM-NEXT: ## -- End function ; ; YMM-LABEL: t: ; YMM: ## %bb.0: ## %entry @@ -44,15 +42,14 @@ ; YMM-NEXT: andl $-32, %esp ; YMM-NEXT: subl $96, %esp ; YMM-NEXT: vxorps 
%xmm0, %xmm0, %xmm0 -; YMM-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) ; YMM-NEXT: leal {{[0-9]+}}(%esp), %eax +; YMM-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp) ; YMM-NEXT: movl %eax, (%esp) ; YMM-NEXT: vzeroupper ; YMM-NEXT: calll _foo ; YMM-NEXT: movl %ebp, %esp ; YMM-NEXT: popl %ebp ; YMM-NEXT: retl -; YMM-NEXT: ## -- End function entry: %up_mvd = alloca [8 x %struct.x] ; <[8 x %struct.x]*> [#uses=2] %up_mvd116 = getelementptr [8 x %struct.x], [8 x %struct.x]* %up_mvd, i32 0, i32 0 ; <%struct.x*> [#uses=1] Index: test/CodeGen/X86/schedule-x86-64-shld.ll =================================================================== --- test/CodeGen/X86/schedule-x86-64-shld.ll +++ test/CodeGen/X86/schedule-x86-64-shld.ll @@ -24,9 +24,9 @@ ; ; BDVER1-LABEL: lshift10_optsize: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: movq %rdi, %rax -; BDVER1-NEXT: shldq $10, %rsi, %rax -; BDVER1-NEXT: retq +; BDVER1-NEXT: movq %rdi, %rax # sched: [1:0.50] +; BDVER1-NEXT: shldq $10, %rsi, %rax # sched: [4:3.00] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %shl = shl i64 %a, 10 %shr = lshr i64 %b, 54 @@ -50,10 +50,10 @@ ; ; BDVER1-LABEL: lshift10: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: shlq $10, %rdi -; BDVER1-NEXT: shrq $54, %rsi -; BDVER1-NEXT: leaq (%rsi,%rdi), %rax -; BDVER1-NEXT: retq +; BDVER1-NEXT: shlq $10, %rdi # sched: [1:0.50] +; BDVER1-NEXT: shrq $54, %rsi # sched: [1:0.50] +; BDVER1-NEXT: leaq (%rsi,%rdi), %rax # sched: [1:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %shl = shl i64 %a, 10 %shr = lshr i64 %b, 54 @@ -82,9 +82,9 @@ ; ; BDVER1-LABEL: rshift10_optsize: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: movq %rdi, %rax -; BDVER1-NEXT: shrdq $62, %rsi, %rax -; BDVER1-NEXT: retq +; BDVER1-NEXT: movq %rdi, %rax # sched: [1:0.50] +; BDVER1-NEXT: shrdq $62, %rsi, %rax # sched: [4:3.00] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %shl = lshr i64 %a, 62 %shr = shl i64 %b, 2 @@ -108,9 +108,9 @@ ; ; BDVER1-LABEL: rshift10: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: shrq $62, %rdi 
-; BDVER1-NEXT: leaq (%rdi,%rsi,4), %rax -; BDVER1-NEXT: retq +; BDVER1-NEXT: shrq $62, %rdi # sched: [1:0.50] +; BDVER1-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %shl = lshr i64 %a, 62 %shr = shl i64 %b, 2 @@ -142,11 +142,11 @@ ; ; BDVER1-LABEL: lshift_cl_optsize: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: movq %rdx, %rcx -; BDVER1-NEXT: movq %rdi, %rax +; BDVER1-NEXT: movq %rdx, %rcx # sched: [1:0.50] +; BDVER1-NEXT: movq %rdi, %rax # sched: [1:0.50] ; BDVER1-NEXT: # kill: def $cl killed $cl killed $rcx -; BDVER1-NEXT: shldq %cl, %rsi, %rax -; BDVER1-NEXT: retq +; BDVER1-NEXT: shldq %cl, %rsi, %rax # sched: [4:4.00] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %shl = shl i64 %a, %c %sub = sub nsw i64 64, %c @@ -177,14 +177,14 @@ ; ; BDVER1-LABEL: lshift_cl: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: movq %rdx, %rcx -; BDVER1-NEXT: movq %rsi, %rax -; BDVER1-NEXT: shlq %cl, %rdi -; BDVER1-NEXT: negl %ecx +; BDVER1-NEXT: movq %rdx, %rcx # sched: [1:0.50] +; BDVER1-NEXT: movq %rsi, %rax # sched: [1:0.50] +; BDVER1-NEXT: shlq %cl, %rdi # sched: [1:0.50] +; BDVER1-NEXT: negl %ecx # sched: [1:0.50] ; BDVER1-NEXT: # kill: def $cl killed $cl killed $rcx -; BDVER1-NEXT: shrq %cl, %rax -; BDVER1-NEXT: orq %rdi, %rax -; BDVER1-NEXT: retq +; BDVER1-NEXT: shrq %cl, %rax # sched: [1:0.50] +; BDVER1-NEXT: orq %rdi, %rax # sched: [1:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %shl = shl i64 %a, %c %sub = sub nsw i64 64, %c @@ -218,11 +218,11 @@ ; ; BDVER1-LABEL: rshift_cl_optsize: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: movq %rdx, %rcx -; BDVER1-NEXT: movq %rdi, %rax +; BDVER1-NEXT: movq %rdx, %rcx # sched: [1:0.50] +; BDVER1-NEXT: movq %rdi, %rax # sched: [1:0.50] ; BDVER1-NEXT: # kill: def $cl killed $cl killed $rcx -; BDVER1-NEXT: shrdq %cl, %rsi, %rax -; BDVER1-NEXT: retq +; BDVER1-NEXT: shrdq %cl, %rsi, %rax # sched: [4:4.00] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %shr = lshr i64 %a, %c %sub = sub nsw 
i64 64, %c @@ -253,14 +253,14 @@ ; ; BDVER1-LABEL: rshift_cl: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: movq %rdx, %rcx -; BDVER1-NEXT: movq %rsi, %rax -; BDVER1-NEXT: shrq %cl, %rdi -; BDVER1-NEXT: negl %ecx +; BDVER1-NEXT: movq %rdx, %rcx # sched: [1:0.50] +; BDVER1-NEXT: movq %rsi, %rax # sched: [1:0.50] +; BDVER1-NEXT: shrq %cl, %rdi # sched: [1:0.50] +; BDVER1-NEXT: negl %ecx # sched: [1:0.50] ; BDVER1-NEXT: # kill: def $cl killed $cl killed $rcx -; BDVER1-NEXT: shlq %cl, %rax -; BDVER1-NEXT: orq %rdi, %rax -; BDVER1-NEXT: retq +; BDVER1-NEXT: shlq %cl, %rax # sched: [1:0.50] +; BDVER1-NEXT: orq %rdi, %rax # sched: [1:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %shr = lshr i64 %a, %c %sub = sub nsw i64 64, %c @@ -293,10 +293,10 @@ ; ; BDVER1-LABEL: lshift_mem_cl_optsize: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: movq %rsi, %rcx +; BDVER1-NEXT: movq %rsi, %rcx # sched: [1:0.50] ; BDVER1-NEXT: # kill: def $cl killed $cl killed $rcx -; BDVER1-NEXT: shldq %cl, %rdi, {{.*}}(%rip) -; BDVER1-NEXT: retq +; BDVER1-NEXT: shldq %cl, %rdi, {{.*}}(%rip) # sched: [4:11.00] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %b = load i64, i64* @x %shl = shl i64 %b, %c @@ -329,15 +329,15 @@ ; ; BDVER1-LABEL: lshift_mem_cl: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: movq %rsi, %rcx -; BDVER1-NEXT: movq {{.*}}(%rip), %rax -; BDVER1-NEXT: shlq %cl, %rax -; BDVER1-NEXT: negl %ecx +; BDVER1-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50] +; BDVER1-NEXT: movq %rsi, %rcx # sched: [1:0.50] +; BDVER1-NEXT: shlq %cl, %rax # sched: [1:0.50] +; BDVER1-NEXT: negl %ecx # sched: [1:0.50] ; BDVER1-NEXT: # kill: def $cl killed $cl killed $rcx -; BDVER1-NEXT: shrq %cl, %rdi -; BDVER1-NEXT: orq %rax, %rdi -; BDVER1-NEXT: movq %rdi, {{.*}}(%rip) -; BDVER1-NEXT: retq +; BDVER1-NEXT: shrq %cl, %rdi # sched: [1:0.50] +; BDVER1-NEXT: orq %rax, %rdi # sched: [1:0.50] +; BDVER1-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %b = load i64, i64* @x 
%shl = shl i64 %b, %c @@ -365,12 +365,12 @@ ; ; BDVER1-LABEL: lshift_mem: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: movq {{.*}}(%rip), %rax -; BDVER1-NEXT: shlq $10, %rax -; BDVER1-NEXT: shrq $54, %rdi -; BDVER1-NEXT: orq %rax, %rdi -; BDVER1-NEXT: movq %rdi, {{.*}}(%rip) -; BDVER1-NEXT: retq +; BDVER1-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50] +; BDVER1-NEXT: shrq $54, %rdi # sched: [1:0.50] +; BDVER1-NEXT: shlq $10, %rax # sched: [1:0.50] +; BDVER1-NEXT: orq %rax, %rdi # sched: [1:0.50] +; BDVER1-NEXT: movq %rdi, {{.*}}(%rip) # sched: [1:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %b = load i64, i64* @x %shl = shl i64 %b, 10 @@ -393,8 +393,8 @@ ; ; BDVER1-LABEL: lshift_mem_optsize: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: shldq $10, %rdi, {{.*}}(%rip) -; BDVER1-NEXT: retq +; BDVER1-NEXT: shldq $10, %rdi, {{.*}}(%rip) # sched: [4:11.00] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %b = load i64, i64* @x %shl = shl i64 %b, 10 @@ -423,12 +423,12 @@ ; ; BDVER1-LABEL: lshift_mem_b: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: movq {{.*}}(%rip), %rax -; BDVER1-NEXT: shlq $10, %rdi -; BDVER1-NEXT: shrq $54, %rax -; BDVER1-NEXT: orq %rdi, %rax -; BDVER1-NEXT: movq %rax, {{.*}}(%rip) -; BDVER1-NEXT: retq +; BDVER1-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50] +; BDVER1-NEXT: shlq $10, %rdi # sched: [1:0.50] +; BDVER1-NEXT: shrq $54, %rax # sched: [1:0.50] +; BDVER1-NEXT: orq %rdi, %rax # sched: [1:0.50] +; BDVER1-NEXT: movq %rax, {{.*}}(%rip) # sched: [1:0.50] +; BDVER1-NEXT: retq # sched: [5:1.00] entry: %a = load i64, i64* @x %shl = shl i64 %b, 10 @@ -455,10 +455,10 @@ ; ; BDVER1-LABEL: lshift_mem_b_optsize: ; BDVER1: # %bb.0: # %entry -; BDVER1-NEXT: movq {{.*}}(%rip), %rax -; BDVER1-NEXT: shrdq $54, %rdi, %rax -; BDVER1-NEXT: movq %rax, {{.*}}(%rip) -; BDVER1-NEXT: retq +; BDVER1-NEXT: movq {{.*}}(%rip), %rax # sched: [5:0.50] +; BDVER1-NEXT: shrdq $54, %rdi, %rax # sched: [4:3.00] +; BDVER1-NEXT: movq %rax, {{.*}}(%rip) # sched: [1:0.50] +; 
BDVER1-NEXT: retq # sched: [5:1.00] entry: %a = load i64, i64* @x %shl = shl i64 %b, 10 Index: test/CodeGen/X86/tbm-schedule.ll =================================================================== --- test/CodeGen/X86/tbm-schedule.ll +++ test/CodeGen/X86/tbm-schedule.ll @@ -14,12 +14,28 @@ ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_bextri_u32: -; BDVER: # %bb.0: -; BDVER-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 -; BDVER-NEXT: bextrl $3076, (%rsi), %eax # imm = 0xC04 -; BDVER-NEXT: addl %ecx, %eax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_bextri_u32: +; BDVER2: # %bb.0: +; BDVER2-NEXT: bextrl $3076, (%rsi), %eax # imm = 0xC04 +; BDVER2-NEXT: # sched: [6:0.50] +; BDVER2-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 +; BDVER2-NEXT: # sched: [2:0.50] +; BDVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_bextri_u32: +; BDVER3: # %bb.0: +; BDVER3-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 +; BDVER3-NEXT: bextrl $3076, (%rsi), %eax # imm = 0xC04 +; BDVER3-NEXT: addl %ecx, %eax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_bextri_u32: +; BDVER4: # %bb.0: +; BDVER4-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 +; BDVER4-NEXT: bextrl $3076, (%rsi), %eax # imm = 0xC04 +; BDVER4-NEXT: addl %ecx, %eax +; BDVER4-NEXT: retq %a1 = load i32, i32* %p1 %r0 = lshr i32 %a0, 4 %m0 = lshr i32 %a1, 4 @@ -39,12 +55,28 @@ ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_bextri_u64: -; BDVER: # %bb.0: -; BDVER-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 -; BDVER-NEXT: bextrl $3076, (%rsi), %eax # imm = 0xC04 -; BDVER-NEXT: addq %rcx, %rax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_bextri_u64: +; BDVER2: # %bb.0: +; BDVER2-NEXT: bextrl $3076, (%rsi), %eax # imm = 0xC04 +; BDVER2-NEXT: # sched: [6:0.50] +; BDVER2-NEXT: bextrl $3076, %edi, %ecx # imm = 
0xC04 +; BDVER2-NEXT: # sched: [2:0.50] +; BDVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_bextri_u64: +; BDVER3: # %bb.0: +; BDVER3-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 +; BDVER3-NEXT: bextrl $3076, (%rsi), %eax # imm = 0xC04 +; BDVER3-NEXT: addq %rcx, %rax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_bextri_u64: +; BDVER4: # %bb.0: +; BDVER4-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 +; BDVER4-NEXT: bextrl $3076, (%rsi), %eax # imm = 0xC04 +; BDVER4-NEXT: addq %rcx, %rax +; BDVER4-NEXT: retq %a1 = load i64, i64* %p1 %r0 = lshr i64 %a0, 4 %m0 = lshr i64 %a1, 4 @@ -62,12 +94,26 @@ ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blcfill_u32: -; BDVER: # %bb.0: -; BDVER-NEXT: blcfilll %edi, %ecx -; BDVER-NEXT: blcfilll (%rsi), %eax -; BDVER-NEXT: addl %ecx, %eax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blcfill_u32: +; BDVER2: # %bb.0: +; BDVER2-NEXT: blcfilll (%rsi), %eax # sched: [5:0.50] +; BDVER2-NEXT: blcfilll %edi, %ecx # sched: [1:0.50] +; BDVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blcfill_u32: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blcfilll %edi, %ecx +; BDVER3-NEXT: blcfilll (%rsi), %eax +; BDVER3-NEXT: addl %ecx, %eax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blcfill_u32: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blcfilll %edi, %ecx +; BDVER4-NEXT: blcfilll (%rsi), %eax +; BDVER4-NEXT: addl %ecx, %eax +; BDVER4-NEXT: retq %a1 = load i32, i32* %p1 %r0 = add i32 %a0, 1 %m0 = add i32 %a1, 1 @@ -85,12 +131,26 @@ ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blcfill_u64: -; BDVER: # %bb.0: -; BDVER-NEXT: blcfillq %rdi, %rcx -; BDVER-NEXT: blcfillq (%rsi), %rax -; BDVER-NEXT: addq %rcx, %rax -; BDVER-NEXT: retq +; BDVER2-LABEL: 
test_x86_tbm_blcfill_u64: +; BDVER2: # %bb.0: +; BDVER2-NEXT: blcfillq (%rsi), %rax # sched: [5:0.50] +; BDVER2-NEXT: blcfillq %rdi, %rcx # sched: [1:0.50] +; BDVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blcfill_u64: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blcfillq %rdi, %rcx +; BDVER3-NEXT: blcfillq (%rsi), %rax +; BDVER3-NEXT: addq %rcx, %rax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blcfill_u64: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blcfillq %rdi, %rcx +; BDVER4-NEXT: blcfillq (%rsi), %rax +; BDVER4-NEXT: addq %rcx, %rax +; BDVER4-NEXT: retq %a1 = load i64, i64* %p1 %r0 = add i64 %a0, 1 %m0 = add i64 %a1, 1 @@ -108,12 +168,26 @@ ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blci_u32: -; BDVER: # %bb.0: -; BDVER-NEXT: blcil %edi, %ecx -; BDVER-NEXT: blcil (%rsi), %eax -; BDVER-NEXT: addl %ecx, %eax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blci_u32: +; BDVER2: # %bb.0: +; BDVER2-NEXT: blcil (%rsi), %eax # sched: [5:0.50] +; BDVER2-NEXT: blcil %edi, %ecx # sched: [1:0.50] +; BDVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blci_u32: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blcil %edi, %ecx +; BDVER3-NEXT: blcil (%rsi), %eax +; BDVER3-NEXT: addl %ecx, %eax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blci_u32: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blcil %edi, %ecx +; BDVER4-NEXT: blcil (%rsi), %eax +; BDVER4-NEXT: addl %ecx, %eax +; BDVER4-NEXT: retq %a1 = load i32, i32* %p1 %r0 = add i32 1, %a0 %m0 = add i32 1, %a1 @@ -133,12 +207,26 @@ ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blci_u64: -; BDVER: # %bb.0: -; BDVER-NEXT: blciq %rdi, %rcx -; BDVER-NEXT: blciq (%rsi), %rax -; BDVER-NEXT: addq %rcx, %rax -; BDVER-NEXT: retq +; BDVER2-LABEL: 
test_x86_tbm_blci_u64: +; BDVER2: # %bb.0: +; BDVER2-NEXT: blciq (%rsi), %rax # sched: [5:0.50] +; BDVER2-NEXT: blciq %rdi, %rcx # sched: [1:0.50] +; BDVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blci_u64: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blciq %rdi, %rcx +; BDVER3-NEXT: blciq (%rsi), %rax +; BDVER3-NEXT: addq %rcx, %rax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blci_u64: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blciq %rdi, %rcx +; BDVER4-NEXT: blciq (%rsi), %rax +; BDVER4-NEXT: addq %rcx, %rax +; BDVER4-NEXT: retq %a1 = load i64, i64* %p1 %r0 = add i64 1, %a0 %m0 = add i64 1, %a1 @@ -158,12 +246,26 @@ ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blcic_u32: -; BDVER: # %bb.0: -; BDVER-NEXT: blcicl %edi, %ecx -; BDVER-NEXT: blcicl (%rsi), %eax -; BDVER-NEXT: addl %ecx, %eax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blcic_u32: +; BDVER2: # %bb.0: +; BDVER2-NEXT: blcicl (%rsi), %eax # sched: [5:0.50] +; BDVER2-NEXT: blcicl %edi, %ecx # sched: [1:0.50] +; BDVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blcic_u32: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blcicl %edi, %ecx +; BDVER3-NEXT: blcicl (%rsi), %eax +; BDVER3-NEXT: addl %ecx, %eax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blcic_u32: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blcicl %edi, %ecx +; BDVER4-NEXT: blcicl (%rsi), %eax +; BDVER4-NEXT: addl %ecx, %eax +; BDVER4-NEXT: retq %a1 = load i32, i32* %p1 %r0 = xor i32 %a0, -1 %m0 = xor i32 %a1, -1 @@ -183,12 +285,26 @@ ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blcic_u64: -; BDVER: # %bb.0: -; BDVER-NEXT: blcicq %rdi, %rcx -; BDVER-NEXT: blcicq (%rsi), %rax -; BDVER-NEXT: addq %rcx, %rax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blcic_u64: +; 
BDVER2: # %bb.0: +; BDVER2-NEXT: blcicq (%rsi), %rax # sched: [5:0.50] +; BDVER2-NEXT: blcicq %rdi, %rcx # sched: [1:0.50] +; BDVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blcic_u64: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blcicq %rdi, %rcx +; BDVER3-NEXT: blcicq (%rsi), %rax +; BDVER3-NEXT: addq %rcx, %rax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blcic_u64: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blcicq %rdi, %rcx +; BDVER4-NEXT: blcicq (%rsi), %rax +; BDVER4-NEXT: addq %rcx, %rax +; BDVER4-NEXT: retq %a1 = load i64, i64* %p1 %r0 = xor i64 %a0, -1 %m0 = xor i64 %a1, -1 @@ -208,12 +324,26 @@ ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blcmsk_u32: -; BDVER: # %bb.0: -; BDVER-NEXT: blcmskl %edi, %ecx -; BDVER-NEXT: blcmskl (%rsi), %eax -; BDVER-NEXT: addl %ecx, %eax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blcmsk_u32: +; BDVER2: # %bb.0: +; BDVER2-NEXT: blcmskl (%rsi), %eax # sched: [5:0.50] +; BDVER2-NEXT: blcmskl %edi, %ecx # sched: [1:0.50] +; BDVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blcmsk_u32: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blcmskl %edi, %ecx +; BDVER3-NEXT: blcmskl (%rsi), %eax +; BDVER3-NEXT: addl %ecx, %eax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blcmsk_u32: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blcmskl %edi, %ecx +; BDVER4-NEXT: blcmskl (%rsi), %eax +; BDVER4-NEXT: addl %ecx, %eax +; BDVER4-NEXT: retq %a1 = load i32, i32* %p1 %r0 = add i32 %a0, 1 %m0 = add i32 %a1, 1 @@ -231,12 +361,26 @@ ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blcmsk_u64: -; BDVER: # %bb.0: -; BDVER-NEXT: blcmskq %rdi, %rcx -; BDVER-NEXT: blcmskq (%rsi), %rax -; BDVER-NEXT: addq %rcx, %rax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blcmsk_u64: +; 
BDVER2: # %bb.0: +; BDVER2-NEXT: blcmskq (%rsi), %rax # sched: [5:0.50] +; BDVER2-NEXT: blcmskq %rdi, %rcx # sched: [1:0.50] +; BDVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blcmsk_u64: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blcmskq %rdi, %rcx +; BDVER3-NEXT: blcmskq (%rsi), %rax +; BDVER3-NEXT: addq %rcx, %rax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blcmsk_u64: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blcmskq %rdi, %rcx +; BDVER4-NEXT: blcmskq (%rsi), %rax +; BDVER4-NEXT: addq %rcx, %rax +; BDVER4-NEXT: retq %a1 = load i64, i64* %p1 %r0 = add i64 %a0, 1 %m0 = add i64 %a1, 1 @@ -254,12 +398,26 @@ ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blcs_u32: -; BDVER: # %bb.0: -; BDVER-NEXT: blcsl %edi, %ecx -; BDVER-NEXT: blcsl (%rsi), %eax -; BDVER-NEXT: addl %ecx, %eax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blcs_u32: +; BDVER2: # %bb.0: +; BDVER2-NEXT: blcsl (%rsi), %eax # sched: [5:0.50] +; BDVER2-NEXT: blcsl %edi, %ecx # sched: [1:0.50] +; BDVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blcs_u32: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blcsl %edi, %ecx +; BDVER3-NEXT: blcsl (%rsi), %eax +; BDVER3-NEXT: addl %ecx, %eax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blcs_u32: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blcsl %edi, %ecx +; BDVER4-NEXT: blcsl (%rsi), %eax +; BDVER4-NEXT: addl %ecx, %eax +; BDVER4-NEXT: retq %a1 = load i32, i32* %p1 %r0 = add i32 %a0, 1 %m0 = add i32 %a1, 1 @@ -277,12 +435,26 @@ ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blcs_u64: -; BDVER: # %bb.0: -; BDVER-NEXT: blcsq %rdi, %rcx -; BDVER-NEXT: blcsq (%rsi), %rax -; BDVER-NEXT: addq %rcx, %rax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blcs_u64: +; BDVER2: # %bb.0: +; 
BDVER2-NEXT: blcsq (%rsi), %rax # sched: [5:0.50] +; BDVER2-NEXT: blcsq %rdi, %rcx # sched: [1:0.50] +; BDVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blcs_u64: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blcsq %rdi, %rcx +; BDVER3-NEXT: blcsq (%rsi), %rax +; BDVER3-NEXT: addq %rcx, %rax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blcs_u64: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blcsq %rdi, %rcx +; BDVER4-NEXT: blcsq (%rsi), %rax +; BDVER4-NEXT: addq %rcx, %rax +; BDVER4-NEXT: retq %a1 = load i64, i64* %p1 %r0 = add i64 %a0, 1 %m0 = add i64 %a1, 1 @@ -300,12 +472,26 @@ ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blsfill_u32: -; BDVER: # %bb.0: -; BDVER-NEXT: blsfilll %edi, %ecx -; BDVER-NEXT: blsfilll (%rsi), %eax -; BDVER-NEXT: addl %ecx, %eax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blsfill_u32: +; BDVER2: # %bb.0: +; BDVER2-NEXT: blsfilll (%rsi), %eax # sched: [5:0.50] +; BDVER2-NEXT: blsfilll %edi, %ecx # sched: [1:0.50] +; BDVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blsfill_u32: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blsfilll %edi, %ecx +; BDVER3-NEXT: blsfilll (%rsi), %eax +; BDVER3-NEXT: addl %ecx, %eax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blsfill_u32: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blsfilll %edi, %ecx +; BDVER4-NEXT: blsfilll (%rsi), %eax +; BDVER4-NEXT: addl %ecx, %eax +; BDVER4-NEXT: retq %a1 = load i32, i32* %p1 %r0 = add i32 %a0, -1 %m0 = add i32 %a1, -1 @@ -323,12 +509,26 @@ ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blsfill_u64: -; BDVER: # %bb.0: -; BDVER-NEXT: blsfillq %rdi, %rcx -; BDVER-NEXT: blsfillq (%rsi), %rax -; BDVER-NEXT: addq %rcx, %rax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blsfill_u64: +; BDVER2: # %bb.0: 
+; BDVER2-NEXT: blsfillq (%rsi), %rax # sched: [5:0.50] +; BDVER2-NEXT: blsfillq %rdi, %rcx # sched: [1:0.50] +; BDVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blsfill_u64: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blsfillq %rdi, %rcx +; BDVER3-NEXT: blsfillq (%rsi), %rax +; BDVER3-NEXT: addq %rcx, %rax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blsfill_u64: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blsfillq %rdi, %rcx +; BDVER4-NEXT: blsfillq (%rsi), %rax +; BDVER4-NEXT: addq %rcx, %rax +; BDVER4-NEXT: retq %a1 = load i64, i64* %p1 %r0 = add i64 %a0, -1 %m0 = add i64 %a1, -1 @@ -346,12 +546,26 @@ ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blsic_u32: -; BDVER: # %bb.0: -; BDVER-NEXT: blsicl %edi, %ecx -; BDVER-NEXT: blsicl (%rsi), %eax -; BDVER-NEXT: addl %ecx, %eax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blsic_u32: +; BDVER2: # %bb.0: +; BDVER2-NEXT: blsicl (%rsi), %eax # sched: [5:0.50] +; BDVER2-NEXT: blsicl %edi, %ecx # sched: [1:0.50] +; BDVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blsic_u32: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blsicl %edi, %ecx +; BDVER3-NEXT: blsicl (%rsi), %eax +; BDVER3-NEXT: addl %ecx, %eax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blsic_u32: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blsicl %edi, %ecx +; BDVER4-NEXT: blsicl (%rsi), %eax +; BDVER4-NEXT: addl %ecx, %eax +; BDVER4-NEXT: retq %a1 = load i32, i32* %p1 %r0 = xor i32 %a0, -1 %m0 = xor i32 %a1, -1 @@ -371,12 +585,26 @@ ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_blsic_u64: -; BDVER: # %bb.0: -; BDVER-NEXT: blsicq %rdi, %rcx -; BDVER-NEXT: blsicq (%rsi), %rax -; BDVER-NEXT: addq %rcx, %rax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_blsic_u64: +; BDVER2: # %bb.0: +; 
BDVER2-NEXT: blsicq (%rsi), %rax # sched: [5:0.50] +; BDVER2-NEXT: blsicq %rdi, %rcx # sched: [1:0.50] +; BDVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_blsic_u64: +; BDVER3: # %bb.0: +; BDVER3-NEXT: blsicq %rdi, %rcx +; BDVER3-NEXT: blsicq (%rsi), %rax +; BDVER3-NEXT: addq %rcx, %rax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_blsic_u64: +; BDVER4: # %bb.0: +; BDVER4-NEXT: blsicq %rdi, %rcx +; BDVER4-NEXT: blsicq (%rsi), %rax +; BDVER4-NEXT: addq %rcx, %rax +; BDVER4-NEXT: retq %a1 = load i64, i64* %p1 %r0 = xor i64 %a0, -1 %m0 = xor i64 %a1, -1 @@ -396,12 +624,26 @@ ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_t1mskc_u32: -; BDVER: # %bb.0: -; BDVER-NEXT: t1mskcl %edi, %ecx -; BDVER-NEXT: t1mskcl (%rsi), %eax -; BDVER-NEXT: addl %ecx, %eax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_t1mskc_u32: +; BDVER2: # %bb.0: +; BDVER2-NEXT: t1mskcl (%rsi), %eax # sched: [5:0.50] +; BDVER2-NEXT: t1mskcl %edi, %ecx # sched: [1:0.50] +; BDVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_t1mskc_u32: +; BDVER3: # %bb.0: +; BDVER3-NEXT: t1mskcl %edi, %ecx +; BDVER3-NEXT: t1mskcl (%rsi), %eax +; BDVER3-NEXT: addl %ecx, %eax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_t1mskc_u32: +; BDVER4: # %bb.0: +; BDVER4-NEXT: t1mskcl %edi, %ecx +; BDVER4-NEXT: t1mskcl (%rsi), %eax +; BDVER4-NEXT: addl %ecx, %eax +; BDVER4-NEXT: retq %a1 = load i32, i32* %p1 %r0 = xor i32 %a0, -1 %m0 = xor i32 %a1, -1 @@ -421,12 +663,26 @@ ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_t1mskc_u64: -; BDVER: # %bb.0: -; BDVER-NEXT: t1mskcq %rdi, %rcx -; BDVER-NEXT: t1mskcq (%rsi), %rax -; BDVER-NEXT: addq %rcx, %rax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_t1mskc_u64: +; BDVER2: # %bb.0: +; 
BDVER2-NEXT: t1mskcq (%rsi), %rax # sched: [5:0.50] +; BDVER2-NEXT: t1mskcq %rdi, %rcx # sched: [1:0.50] +; BDVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_t1mskc_u64: +; BDVER3: # %bb.0: +; BDVER3-NEXT: t1mskcq %rdi, %rcx +; BDVER3-NEXT: t1mskcq (%rsi), %rax +; BDVER3-NEXT: addq %rcx, %rax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_t1mskc_u64: +; BDVER4: # %bb.0: +; BDVER4-NEXT: t1mskcq %rdi, %rcx +; BDVER4-NEXT: t1mskcq (%rsi), %rax +; BDVER4-NEXT: addq %rcx, %rax +; BDVER4-NEXT: retq %a1 = load i64, i64* %p1 %r0 = xor i64 %a0, -1 %m0 = xor i64 %a1, -1 @@ -446,12 +702,26 @@ ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_tzmsk_u32: -; BDVER: # %bb.0: -; BDVER-NEXT: tzmskl %edi, %ecx -; BDVER-NEXT: tzmskl (%rsi), %eax -; BDVER-NEXT: addl %ecx, %eax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_tzmsk_u32: +; BDVER2: # %bb.0: +; BDVER2-NEXT: tzmskl (%rsi), %eax # sched: [5:0.50] +; BDVER2-NEXT: tzmskl %edi, %ecx # sched: [1:0.50] +; BDVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_tzmsk_u32: +; BDVER3: # %bb.0: +; BDVER3-NEXT: tzmskl %edi, %ecx +; BDVER3-NEXT: tzmskl (%rsi), %eax +; BDVER3-NEXT: addl %ecx, %eax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_tzmsk_u32: +; BDVER4: # %bb.0: +; BDVER4-NEXT: tzmskl %edi, %ecx +; BDVER4-NEXT: tzmskl (%rsi), %eax +; BDVER4-NEXT: addl %ecx, %eax +; BDVER4-NEXT: retq %a1 = load i32, i32* %p1 %r0 = xor i32 %a0, -1 %m0 = xor i32 %a1, -1 @@ -471,12 +741,26 @@ ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; -; BDVER-LABEL: test_x86_tbm_tzmsk_u64: -; BDVER: # %bb.0: -; BDVER-NEXT: tzmskq %rdi, %rcx -; BDVER-NEXT: tzmskq (%rsi), %rax -; BDVER-NEXT: addq %rcx, %rax -; BDVER-NEXT: retq +; BDVER2-LABEL: test_x86_tbm_tzmsk_u64: +; BDVER2: # %bb.0: +; 
BDVER2-NEXT: tzmskq (%rsi), %rax # sched: [5:0.50] +; BDVER2-NEXT: tzmskq %rdi, %rcx # sched: [1:0.50] +; BDVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BDVER2-NEXT: retq # sched: [5:1.00] +; +; BDVER3-LABEL: test_x86_tbm_tzmsk_u64: +; BDVER3: # %bb.0: +; BDVER3-NEXT: tzmskq %rdi, %rcx +; BDVER3-NEXT: tzmskq (%rsi), %rax +; BDVER3-NEXT: addq %rcx, %rax +; BDVER3-NEXT: retq +; +; BDVER4-LABEL: test_x86_tbm_tzmsk_u64: +; BDVER4: # %bb.0: +; BDVER4-NEXT: tzmskq %rdi, %rcx +; BDVER4-NEXT: tzmskq (%rsi), %rax +; BDVER4-NEXT: addq %rcx, %rax +; BDVER4-NEXT: retq %a1 = load i64, i64* %p1 %r0 = xor i64 %a0, -1 %m0 = xor i64 %a1, -1 Index: test/CodeGen/X86/wide-fma-contraction.ll =================================================================== --- test/CodeGen/X86/wide-fma-contraction.ll +++ test/CodeGen/X86/wide-fma-contraction.ll @@ -30,8 +30,8 @@ ; CHECK-NOFMA-NEXT: andl $-32, %esp ; CHECK-NOFMA-NEXT: subl $32, %esp ; CHECK-NOFMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; CHECK-NOFMA-NEXT: vaddps 8(%ebp), %ymm0, %ymm0 ; CHECK-NOFMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; CHECK-NOFMA-NEXT: vaddps 8(%ebp), %ymm0, %ymm0 ; CHECK-NOFMA-NEXT: vaddps 40(%ebp), %ymm1, %ymm1 ; CHECK-NOFMA-NEXT: movl %ebp, %esp ; CHECK-NOFMA-NEXT: popl %ebp