Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -587,8 +587,7 @@ FeatureFastVectorFSQRT ]>; -// FIXME: define SKL model -class SkylakeClientProc : ProcModel : ProcModel; def : SkylakeClientProc<"skylake">; @@ -623,7 +622,7 @@ ]>; // FIXME: define SKX model -class SkylakeServerProc : ProcModel : ProcModel; def : SkylakeServerProc<"skylake-avx512">; def : SkylakeServerProc<"skx">; // Legacy alias. Index: lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- /dev/null +++ lib/Target/X86/X86SchedSkylakeClient.td @@ -0,0 +1,4108 @@ +//=- X86SchedSkylake.td - X86 Skylake Client Scheduling ------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Skylake Client to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def SkylakeClientModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SKylake can + // decode 6 instructions per cycle. + let IssueWidth = 6; + let MicroOpBufferSize = 224; // Based on the reorder buffer. + let LoadLatency = 5; + let MispredictPenalty = 14; + + // Based on the LSD (loop-stream detector) queue size and benchmarking data. + let LoopMicroOpBufferSize = 50; + + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SkylakeClientModel in { + +// Skylake Client can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, and 6 handle all computation. 
+// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations. +def SKLPort0 : ProcResource<1>; +def SKLPort1 : ProcResource<1>; +def SKLPort2 : ProcResource<1>; +def SKLPort3 : ProcResource<1>; +def SKLPort4 : ProcResource<1>; +def SKLPort5 : ProcResource<1>; +def SKLPort6 : ProcResource<1>; +def SKLPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def SKLPort01 : ProcResGroup<[SKLPort0, SKLPort1]>; +def SKLPort23 : ProcResGroup<[SKLPort2, SKLPort3]>; +def SKLPort237 : ProcResGroup<[SKLPort2, SKLPort3, SKLPort7]>; +def SKLPort04 : ProcResGroup<[SKLPort0, SKLPort4]>; +def SKLPort05 : ProcResGroup<[SKLPort0, SKLPort5]>; +def SKLPort06 : ProcResGroup<[SKLPort0, SKLPort6]>; +def SKLPort15 : ProcResGroup<[SKLPort1, SKLPort5]>; +def SKLPort16 : ProcResGroup<[SKLPort1, SKLPort6]>; +def SKLPort56 : ProcResGroup<[SKLPort5, SKLPort6]>; +def SKLPort015 : ProcResGroup<[SKLPort0, SKLPort1, SKLPort5]>; +def SKLPort056 : ProcResGroup<[SKLPort0, SKLPort5, SKLPort6]>; +def SKLPort0156: ProcResGroup<[SKLPort0, SKLPort1, SKLPort5, SKLPort6]>; + +// 60 Entry Unified Scheduler +def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4, + SKLPort5, SKLPort6, SKLPort7]> { + let BufferSize=60; +} + +// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4 +// cycles after the memory operand. +def : ReadAdvance; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. 
+multiclass SKLWriteResPair { + // Register variant is using a single cycle on ExePort. + def : WriteRes { let Latency = Lat; } + + // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the + // latency. + def : WriteRes { + let Latency = !add(Lat, 4); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes; + +// Arithmetic. +defm : SKLWriteResPair; // Simple integer ALU op. +defm : SKLWriteResPair; // Integer multiplication. +def : WriteRes { let Latency = 3; } // Integer multiplication, high part. +def SKLDivider : ProcResource<1>; // Integer division issued on port 0. +def : WriteRes { // Integer division. + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +def : WriteRes; // LEA instructions can't fold loads. + +// Integer shifts and rotates. +defm : SKLWriteResPair; + +// Loads, stores, and moves, not folded with other operations. +def : WriteRes { let Latency = 4; } +def : WriteRes; +def : WriteRes; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +def : WriteRes; + +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +defm : SKLWriteResPair; + +// Floating point. This covers both scalar and vector operations. +defm : SKLWriteResPair; // Floating point add/sub/compare. +defm : SKLWriteResPair; // Floating point multiplication. +defm : SKLWriteResPair; // 10-14 cycles. // Floating point division. +defm : SKLWriteResPair; // Floating point square root. +defm : SKLWriteResPair; // Floating point reciprocal estimate. +defm : SKLWriteResPair; // Floating point reciprocal square root estimate. +// defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. +defm : SKLWriteResPair; // Floating point vector shuffles. 
+defm : SKLWriteResPair; // Floating point vector blends. +def : WriteRes { // Fp vector variable blends. + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +// FMA Scheduling helper class. +// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } + +// Vector integer operations. +defm : SKLWriteResPair; // Vector integer ALU op, no logicals. +defm : SKLWriteResPair; // Vector integer shifts. +defm : SKLWriteResPair; // Vector integer multiply. +defm : SKLWriteResPair; // Vector shuffles. +defm : SKLWriteResPair; // Vector blends. + +def : WriteRes { // Vector variable blends. + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +def : WriteRes { // Vector MPSAD. + let Latency = 6; + let ResourceCycles = [1, 2]; +} +def : WriteRes { + let Latency = 6; + let ResourceCycles = [1, 1, 2]; +} + +// Vector bitwise operations. +// These are often used on both floating point and integer vectors. +defm : SKLWriteResPair; // Vector and/or/xor. + +// Conversion between integer and float. +defm : SKLWriteResPair; // Float -> Integer. +defm : SKLWriteResPair; // Integer -> Float. +defm : SKLWriteResPair; // Float -> Float size conversion. + +// Strings instructions. +// Packed Compare Implicit Length Strings, Return Mask +// String instructions. 
+def : WriteRes { + let Latency = 10; + let ResourceCycles = [3]; +} +def : WriteRes { + let Latency = 10; + let ResourceCycles = [3, 1]; +} +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes { + let Latency = 10; + let ResourceCycles = [3, 2, 4]; +} +def : WriteRes { + let Latency = 10; + let ResourceCycles = [6, 2, 1]; +} + // Packed Compare Implicit Length Strings, Return Index +def : WriteRes { + let Latency = 11; + let ResourceCycles = [3]; +} +def : WriteRes { + let Latency = 11; + let ResourceCycles = [3, 1]; +} +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes { + let Latency = 11; + let ResourceCycles = [6, 2]; +} +def : WriteRes { + let Latency = 11; + let ResourceCycles = [3, 2, 2, 1]; +} + +// AES instructions. +def : WriteRes { // Decryption, encryption. + let Latency = 7; + let ResourceCycles = [1]; +} +def : WriteRes { + let Latency = 7; + let ResourceCycles = [1, 1]; +} +def : WriteRes { // InvMixColumn. + let Latency = 14; + let ResourceCycles = [2]; +} +def : WriteRes { + let Latency = 14; + let ResourceCycles = [2, 1]; +} +def : WriteRes { // Key Generation. + let Latency = 10; + let ResourceCycles = [2, 8]; +} +def : WriteRes { + let Latency = 10; + let ResourceCycles = [2, 7, 1]; +} + +// Carry-less multiplication instructions. +def : WriteRes { + let Latency = 7; + let ResourceCycles = [2, 1]; +} +def : WriteRes { + let Latency = 7; + let ResourceCycles = [2, 1, 1]; +} + +// Catch-all for expensive system instructions. +def : WriteRes { let Latency = 100; } // def WriteSystem : SchedWrite; + +// AVX2. +defm : SKLWriteResPair; // Fp 256-bit width vector shuffles. +defm : SKLWriteResPair; // 256-bit width vector shuffles. +def : WriteRes { // Variable vector shifts. + let Latency = 2; + let ResourceCycles = [2, 1]; +} +def : WriteRes { + let Latency = 6; + let ResourceCycles = [2, 1, 1]; +} + +// Old microcoded instructions that nobody use. 
+def : WriteRes { let Latency = 100; } // def WriteMicrocoded : SchedWrite; + +// Fence instructions. +def : WriteRes; + +// Nop, not very useful expect it provides a model for nops! +def : WriteRes; + +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// +// HADD, HSUB PS/PD +// x,x / v,v,v. +def : WriteRes { + let Latency = 3; +} + +// x,m / v,v,m. +def : WriteRes { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +// PHADD|PHSUB (S) W/D. +// v <- v,v. +def : WriteRes; + +// v <- v,m. +def : WriteRes { + let Latency = 5; + let ResourceCycles = [1, 1]; +} + +// Remaining instrs. + +def SKLWriteResGroup0 : SchedWriteRes<[SKLPort23]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup0], (instregex "LDDQUrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "LD_F32m")>; +def: InstRW<[SKLWriteResGroup0], (instregex "LD_F64m")>; +def: InstRW<[SKLWriteResGroup0], (instregex "LD_F80m")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MMX_MOVD64from64rm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MMX_MOVD64rm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MMX_MOVD64to64rm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MMX_MOVQ64rm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOV(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOV64toPQIrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOV8rm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVAPDrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVAPSrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVDDUPrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVDI2PDIrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVDQArm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVDQUrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVNTDQArm")>; +def: 
InstRW<[SKLWriteResGroup0], (instregex "MOVSHDUPrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVSLDUPrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVSSrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVSX(16|32|64)rm16")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVSX(16|32|64)rm32")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVSX(16|32|64)rm8")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVUPDrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVUPSrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVZX(16|32|64)rm16")>; +def: InstRW<[SKLWriteResGroup0], (instregex "MOVZX(16|32|64)rm8")>; +def: InstRW<[SKLWriteResGroup0], (instregex "PREFETCHNTA")>; +def: InstRW<[SKLWriteResGroup0], (instregex "PREFETCHT0")>; +def: InstRW<[SKLWriteResGroup0], (instregex "PREFETCHT1")>; +def: InstRW<[SKLWriteResGroup0], (instregex "PREFETCHT2")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VBROADCASTF128")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VBROADCASTI128")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VBROADCASTSDYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VBROADCASTSSYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VBROADCASTSSrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VLDDQUYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VLDDQUrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOV64toPQIrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVAPDYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVAPDrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVAPSYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVAPSrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVDDUPYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVDDUPrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVDI2PDIrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVDQAYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVDQArm")>; +def: InstRW<[SKLWriteResGroup0], (instregex 
"VMOVDQUYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVDQUrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVNTDQAYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVNTDQArm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVQI2PQIrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVSDrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVSHDUPYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVSHDUPrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVSLDUPYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVSLDUPrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVSSrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVUPDYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVUPDrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVUPSYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VMOVUPSrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VPBROADCASTDYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VPBROADCASTDrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VPBROADCASTQYrm")>; +def: InstRW<[SKLWriteResGroup0], (instregex "VPBROADCASTQrm")>; + +def SKLWriteResGroup1 : SchedWriteRes<[SKLPort4,SKLPort237]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup1], (instregex "FBSTPm")>; +def: InstRW<[SKLWriteResGroup1], (instregex "KMOVBmk")>; +def: InstRW<[SKLWriteResGroup1], (instregex "KMOVDmk")>; +def: InstRW<[SKLWriteResGroup1], (instregex "KMOVQmk")>; +def: InstRW<[SKLWriteResGroup1], (instregex "KMOVWmk")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_MOVD64from64rm")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_MOVD64mr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_MOVNTQmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_MOVQ64mr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOV(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOV8mi")>; +def: InstRW<[SKLWriteResGroup1], (instregex 
"MOV8mr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVAPDmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVAPSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVDQAmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVDQUmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVHPDmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVHPSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVLPDmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVLPSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVNTDQmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVNTI_64mr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVNTImr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVNTPDmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVNTPSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVPDI2DImr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVPQI2QImr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVPQIto64mr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVSSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVUPDmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MOVUPSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "ST_FP32m")>; +def: InstRW<[SKLWriteResGroup1], (instregex "ST_FP64m")>; +def: InstRW<[SKLWriteResGroup1], (instregex "ST_FP80m")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VEXTRACTF128mr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VEXTRACTI128mr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVAPDYmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVAPDmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVAPSYmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVAPSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVDQAYmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVDQAmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVDQUYmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVDQUmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVHPDmr")>; 
+def: InstRW<[SKLWriteResGroup1], (instregex "VMOVHPSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVLPDmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVLPSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVNTDQYmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVNTDQmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVNTPDYmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVNTPDmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVNTPSYmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVNTPSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVPDI2DImr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVPQI2QImr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVPQIto64mr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVSDmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVSSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVUPDYmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVUPDmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVUPSYmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMOVUPSmr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "VMPTRSTm")>; + +def SKLWriteResGroup2 : SchedWriteRes<[SKLPort0]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup2], (instregex "KANDBrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KANDDrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KANDNBrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KANDNDrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KANDNQrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KANDNWrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KANDQrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KANDWrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KMOVBkk")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KMOVDkk")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KMOVQkk")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KMOVWkk")>; +def: 
InstRW<[SKLWriteResGroup2], (instregex "KNOTBrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KNOTDrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KNOTQrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KNOTWrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KORBrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KORDrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KORQrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KORWrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KXNORBrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KXNORDrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KXNORQrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KXNORWrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KXORBrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KXORDrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KXORQrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "KXORWrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PADDSBirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PADDSWirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PADDUSBirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PADDUSWirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PAVGBirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PAVGWirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PCMPEQBirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PCMPEQDirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PCMPEQWirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PCMPGTBirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PCMPGTDirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PCMPGTWirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PMAXSWirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PMAXUBirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PMINSWirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PMINUBirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex 
"MMX_PSLLDri")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSLLDrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSLLQri")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSLLQrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSLLWri")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSLLWrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSRADri")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSRADrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSRAWri")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSRAWrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSRLDri")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSRLDrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSRLQri")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSRLQrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSRLWri")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSRLWrr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSUBSBirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSUBSWirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSUBUSBirr")>; +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_PSUBUSWirr")>; + +def SKLWriteResGroup3 : SchedWriteRes<[SKLPort1]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup3], (instregex "MMX_MASKMOVQ64")>; + +def SKLWriteResGroup4 : SchedWriteRes<[SKLPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup4], (instregex "COMP_FST0r")>; +def: InstRW<[SKLWriteResGroup4], (instregex "COM_FST0r")>; +def: InstRW<[SKLWriteResGroup4], (instregex "INSERTPSrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "KMOVBkr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "KMOVDkr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "KMOVQkr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "KMOVWkr")>; +def: InstRW<[SKLWriteResGroup4], (instregex 
"MMX_MOVD64rr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MMX_MOVD64to64rr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MMX_PALIGNR64irr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MMX_PSHUFBrr64")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MMX_PSHUFWri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MMX_PUNPCKHBWirr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MMX_PUNPCKHDQirr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MMX_PUNPCKHWDirr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MMX_PUNPCKLBWirr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MMX_PUNPCKLDQirr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MMX_PUNPCKLWDirr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MOV64toPQIrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MOVDDUPrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MOVDI2PDIrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MOVHLPSrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MOVLHPSrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MOVSDrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MOVSHDUPrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MOVSLDUPrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MOVUPDrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup4], (instregex "MOVUPSrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PACKSSDWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PACKSSWBrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PACKUSDWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PACKUSWBrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PALIGNRrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PBLENDWrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PMOVSXBDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PMOVSXBQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PMOVSXBWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PMOVSXDQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PMOVSXWDrr")>; +def: 
InstRW<[SKLWriteResGroup4], (instregex "PMOVSXWQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PMOVZXBDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PMOVZXBQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PMOVZXBWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PMOVZXDQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PMOVZXWDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PMOVZXWQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PSHUFBrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PSHUFDri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PSHUFHWri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PSHUFLWri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PSLLDQri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PSRLDQri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PUNPCKHBWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PUNPCKHDQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PUNPCKHQDQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PUNPCKHWDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PUNPCKLBWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PUNPCKLDQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PUNPCKLQDQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "PUNPCKLWDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "SHUFPDrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "SHUFPSrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "UCOM_FPr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "UCOM_Fr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "UNPCKHPDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "UNPCKHPSrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "UNPCKLPDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "UNPCKLPSrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VBROADCASTSSrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VINSERTPSrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOV64toPQIrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex 
"VMOVDDUPYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVDDUPrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVDI2PDIrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVHLPSrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVLHPSrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVSDrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVSHDUPYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVSHDUPrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVSLDUPYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVSLDUPrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVUPDYrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVUPDrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVUPSYrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VMOVUPSrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPACKSSDWYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPACKSSDWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPACKSSWBYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPACKSSWBrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPACKUSDWYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPACKUSDWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPACKUSWBYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPACKUSWBrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPALIGNRYrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPALIGNRrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPBLENDWYrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPBLENDWrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPBROADCASTDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPBROADCASTQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPERMILPDYri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPERMILPDYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPERMILPDri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPERMILPDrr")>; +def: 
InstRW<[SKLWriteResGroup4], (instregex "VPERMILPSYri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPERMILPSYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPERMILPSri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPERMILPSrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVSXBDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVSXBQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVSXBWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVSXDQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVSXWDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVSXWQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVZXBDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVZXBQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVZXBWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVZXDQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVZXWDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPMOVZXWQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSHUFBYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSHUFBrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSHUFDYri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSHUFDri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSHUFHWYri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSHUFHWri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSHUFLWYri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSHUFLWri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSLLDQYri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSLLDQri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSRLDQYri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPSRLDQri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKHBWYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKHBWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKHDQYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKHDQrr")>; +def: 
InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKHQDQYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKHQDQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKHWDYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKHWDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKLBWYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKLBWrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKLDQYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKLDQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKLQDQYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKLQDQrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKLWDYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VPUNPCKLWDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VSHUFPDYrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VSHUFPDrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VSHUFPSYrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VSHUFPSrri")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VUNPCKHPDYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VUNPCKHPDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VUNPCKHPSYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VUNPCKHPSrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VUNPCKLPDYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VUNPCKLPDrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VUNPCKLPSYrr")>; +def: InstRW<[SKLWriteResGroup4], (instregex "VUNPCKLPSrr")>; + +def SKLWriteResGroup5 : SchedWriteRes<[SKLPort6]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup5], (instregex "JMP(16|32|64)r")>; + +def SKLWriteResGroup6 : SchedWriteRes<[SKLPort01]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup6], (instregex "PABSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PABSDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex 
"PABSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PADDSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PADDSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PADDUSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PADDUSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PAVGBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PAVGWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PCMPEQBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PCMPEQDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PCMPEQQrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PCMPEQWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PCMPGTBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PCMPGTDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PCMPGTWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMAXSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMAXSDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMAXSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMAXUBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMAXUDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMAXUWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMINSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMINSDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMINSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMINUBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMINUDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PMINUWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSIGNBrr128")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSIGNDrr128")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSIGNWrr128")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSLLDri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSLLQri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSLLWri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSRADri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSRAWri")>; +def: 
InstRW<[SKLWriteResGroup6], (instregex "PSRLDri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSRLQri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSRLWri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSUBSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSUBSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSUBUSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "PSUBUSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPABSBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPABSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPABSDYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPABSDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPABSWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPABSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPADDSBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPADDSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPADDSWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPADDSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPADDUSBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPADDUSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPADDUSWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPADDUSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPAVGBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPAVGBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPAVGWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPAVGWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPEQBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPEQBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPEQDYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPEQDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPEQQYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPEQQrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPEQWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPEQWrr")>; +def: 
InstRW<[SKLWriteResGroup6], (instregex "VPCMPGTBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPGTBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPGTDYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPGTDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPGTWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPCMPGTWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXSBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXSDYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXSDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXSWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXUBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXUBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXUDYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXUDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXUWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMAXUWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINSBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINSDYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINSDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINSWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINUBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINUBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINUDYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINUDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINUWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPMINUWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSIGNBYrr256")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSIGNBrr128")>; +def: InstRW<[SKLWriteResGroup6], (instregex 
"VPSIGNDYrr256")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSIGNDrr128")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSIGNWYrr256")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSIGNWrr128")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSLLDYri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSLLDri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSLLQYri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSLLQri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSLLVDYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSLLVDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSLLVQYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSLLVQrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSLLWYri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSLLWri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRADYri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRADri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRAVDYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRAVDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRAWYri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRAWri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRLDYri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRLDri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRLQYri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRLQri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRLVDYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRLVDrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRLVQYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRLVQrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRLWYri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSRLWri")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSUBSBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSUBSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSUBSWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex 
"VPSUBSWrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSUBUSBYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSUBUSBrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSUBUSWYrr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "VPSUBUSWrr")>; + +def SKLWriteResGroup7 : SchedWriteRes<[SKLPort05]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup7], (instregex "FINCSTP")>; +def: InstRW<[SKLWriteResGroup7], (instregex "FNOP")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_MOVQ64rr(_REV?)")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PABSBrr64")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PABSDrr64")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PABSWrr64")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PADDBirr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PADDDirr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PADDQirr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PADDWirr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PANDNirr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PANDirr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PORirr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PSIGNBrr64")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PSIGNDrr64")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PSIGNWrr64")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PSUBBirr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PSUBDirr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PSUBQirr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PSUBWirr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "MMX_PXORirr")>; + +def SKLWriteResGroup8 : SchedWriteRes<[SKLPort06]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup8], (instregex "ADC(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup8], (instregex "ADC(16|32|64)rr(_REV?)")>; +def: 
InstRW<[SKLWriteResGroup8], (instregex "ADC8rr(_REV?)")>; +def: InstRW<[SKLWriteResGroup8], (instregex "ADCX32rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "ADCX64rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "ADOX32rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "ADOX64rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BT(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BT(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BTC(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BTC(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BTR(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BTR(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BTS(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BTS(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CDQ")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CLAC")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVAE(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVB(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVE(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVG(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVGE(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVL(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVLE(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVNE(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVNO(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVNP(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVNS(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVO(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVP(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CMOVS(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "CQO")>; +def: InstRW<[SKLWriteResGroup8], (instregex 
"JAE_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JAE_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JA_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JA_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JBE_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JBE_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JB_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JB_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JE_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JE_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JGE_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JGE_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JG_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JG_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JLE_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JLE_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JL_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JL_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JMP_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JMP_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JNE_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JNE_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JNO_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JNO_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JNP_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JNP_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JNS_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JNS_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JO_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JO_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JP_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JP_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JS_1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "JS_4")>; +def: InstRW<[SKLWriteResGroup8], (instregex "RORX32ri")>; +def: InstRW<[SKLWriteResGroup8], (instregex "RORX64ri")>; +def: InstRW<[SKLWriteResGroup8], 
(instregex "SAR(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SAR(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SAR8r1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SAR8ri")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SARX32rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SARX64rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SBB(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SBB(16|32|64)rr(_REV?)")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SBB8rr(_REV?)")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETAEr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETBr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETEr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETGEr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETGr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETLEr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETLr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETNEr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETNOr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETNPr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETNSr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETOr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETPr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SETSr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SHL(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SHL(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SHL8r1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SHL8ri")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SHLX32rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SHLX64rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SHR(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SHR(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SHR8r1")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SHR8ri")>; +def: InstRW<[SKLWriteResGroup8], (instregex "SHRX32rr")>; +def: 
InstRW<[SKLWriteResGroup8], (instregex "SHRX64rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "STAC")>; + +def SKLWriteResGroup9 : SchedWriteRes<[SKLPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup9], (instregex "ANDN32rr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "ANDN64rr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "BLSI32rr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "BLSI64rr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "BLSMSK32rr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "BLSMSK64rr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "BLSR32rr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "BLSR64rr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "BZHI32rr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "BZHI64rr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "LEA(16|32|64)r")>; + +def SKLWriteResGroup10 : SchedWriteRes<[SKLPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup10], (instregex "ANDNPDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "ANDNPSrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "ANDPDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "ANDPSrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "BLENDPDrri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "BLENDPSrri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MMX_MOVD64from64rr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOVAPDrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOVAPSrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOVDQArr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOVDQUrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOVPQI2QIrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOVSSrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "ORPDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "ORPSrr")>; +def: 
InstRW<[SKLWriteResGroup10], (instregex "PADDBrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "PADDDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "PADDQrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "PADDWrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "PANDNrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "PANDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "PORrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "PSUBBrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "PSUBDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "PSUBQrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "PSUBWrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "PXORrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VANDNPDYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VANDNPDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VANDNPSYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VANDNPSrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VANDPDYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VANDPDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VANDPSYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VANDPSrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VBLENDPDYrri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VBLENDPDrri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VBLENDPSYrri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VBLENDPSrri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VMOVAPDYrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VMOVAPDrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VMOVAPSYrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VMOVAPSrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VMOVDQAYrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VMOVDQArr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VMOVDQUYrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VMOVDQUrr(_REV?)")>; +def: 
InstRW<[SKLWriteResGroup10], (instregex "VMOVPQI2QIrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VMOVSSrr(_REV?)")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VMOVZPQILo2PQIrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VORPDYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VORPDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VORPSYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VORPSrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPADDBYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPADDBrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPADDDYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPADDDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPADDQYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPADDQrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPADDWYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPADDWrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPANDNYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPANDNrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPANDYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPANDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPBLENDDYrri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPBLENDDrri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPORYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPORrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPSUBBYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPSUBBrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPSUBDYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPSUBDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPSUBQYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPSUBQrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPSUBWYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPSUBWrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VPXORYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex 
"VPXORrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VXORPDYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VXORPDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VXORPSYrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "VXORPSrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "XORPDrr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "XORPSrr")>; + +def SKLWriteResGroup11 : SchedWriteRes<[SKLPort0156]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup11], (instregex "ADD(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup11], (instregex "ADD(16|32|64)rr(_REV?)")>; +def: InstRW<[SKLWriteResGroup11], (instregex "ADD8i8")>; +def: InstRW<[SKLWriteResGroup11], (instregex "ADD8ri")>; +def: InstRW<[SKLWriteResGroup11], (instregex "ADD8rr(_REV?)")>; +def: InstRW<[SKLWriteResGroup11], (instregex "AND(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup11], (instregex "AND(16|32|64)rr(_REV?)")>; +def: InstRW<[SKLWriteResGroup11], (instregex "AND8i8")>; +def: InstRW<[SKLWriteResGroup11], (instregex "AND8ri")>; +def: InstRW<[SKLWriteResGroup11], (instregex "AND8rr(_REV?)")>; +def: InstRW<[SKLWriteResGroup11], (instregex "CBW")>; +def: InstRW<[SKLWriteResGroup11], (instregex "CLC")>; +def: InstRW<[SKLWriteResGroup11], (instregex "CMC")>; +def: InstRW<[SKLWriteResGroup11], (instregex "CMP(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup11], (instregex "CMP(16|32|64)rr(_REV?)")>; +def: InstRW<[SKLWriteResGroup11], (instregex "CMP8i8")>; +def: InstRW<[SKLWriteResGroup11], (instregex "CMP8ri")>; +def: InstRW<[SKLWriteResGroup11], (instregex "CMP8rr(_REV?)")>; +def: InstRW<[SKLWriteResGroup11], (instregex "CWDE")>; +def: InstRW<[SKLWriteResGroup11], (instregex "DEC(16|32|64)r")>; +def: InstRW<[SKLWriteResGroup11], (instregex "DEC8r")>; +def: InstRW<[SKLWriteResGroup11], (instregex "INC(16|32|64)r")>; +def: InstRW<[SKLWriteResGroup11], (instregex "INC8r")>; +def: InstRW<[SKLWriteResGroup11], (instregex "LAHF")>; 
+// Note: "(_REV)?" makes the whole suffix optional. The previous "(_REV?)"
+// (and "(_alt?)") only made the final character optional, so the pattern
+// required a literal "_RE" and never matched the plain, non-_REV opcode.
+def: InstRW<[SKLWriteResGroup11], (instregex "MOV(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOV8ri(_alt)?")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOV8ri_alt")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOV8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVSX(16|32|64)rr16")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVSX(16|32|64)rr32")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVSX(16|32|64)rr8")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVZX(16|32|64)rr16")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVZX(16|32|64)rr8")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "NEG(16|32|64)r")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "NEG8r")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "NOOP")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "NOT(16|32|64)r")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "NOT8r")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "OR(16|32|64)ri8")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "OR(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "OR8i8")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "OR8ri")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "OR8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "SAHF")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "SGDT64m")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "SIDT64m")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "SLDT64m")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "SMSW16m")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "STC")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "STRm")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "SUB(16|32|64)ri8")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "SUB(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "SUB8i8")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "SUB8ri")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "SUB8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "SYSCALL")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "TEST(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "TEST8i8")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "TEST8ri")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "TEST8rr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "XCHG(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "XOR(16|32|64)ri8")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "XOR(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "XOR8i8")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "XOR8ri")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "XOR8rr(_REV)?")>;
+
+// MMX ALU ops with a folded load: one uop on port 0 plus a load uop on
+// port 2/3.
+def SKLWriteResGroup12 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PADDSBirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PADDSWirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PADDUSBirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PADDUSWirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PAVGBirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PAVGWirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PCMPEQBirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PCMPEQDirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PCMPEQWirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PCMPGTBirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PCMPGTDirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PCMPGTWirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PMAXSWirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PMAXUBirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PMINSWirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PMINUBirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSLLDrm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSLLQrm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSLLWrm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSRADrm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSRAWrm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSRLDrm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSRLQrm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSRLWrm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSUBSBirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSUBSWirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSUBUSBirm")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PSUBUSWirm")>;
+
+// Masked stores: one uop on port 0 plus a store-address uop on port 2/3/7.
+def SKLWriteResGroup13 : SchedWriteRes<[SKLPort0,SKLPort237]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup13], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VMASKMOVDQU")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VMASKMOVPDYmr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VMASKMOVPDmr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VMASKMOVPSYmr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VMASKMOVPSmr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VPMASKMOVDYmr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VPMASKMOVDmr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VPMASKMOVQYmr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VPMASKMOVQmr")>;
+
+// Shuffle/insert/x87-compare ops with a folded load: one uop on port 5 plus
+// a load uop on port 2/3.
+def SKLWriteResGroup14 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup14], (instregex "FCOM32m")>;
+def: InstRW<[SKLWriteResGroup14], (instregex "FCOM64m")>;
+def: InstRW<[SKLWriteResGroup14], (instregex "FCOMP32m")>;
+def: InstRW<[SKLWriteResGroup14], (instregex "FCOMP64m")>;
+def: InstRW<[SKLWriteResGroup14], (instregex "INSERTPSrm")>;
+def: InstRW<[SKLWriteResGroup14], (instregex "MMX_PALIGNR64irm")>;
+def: InstRW<[SKLWriteResGroup14], (instregex "MMX_PINSRWirmi")>;
+def: InstRW<[SKLWriteResGroup14], (instregex "MMX_PSHUFBrm64")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MMX_PSHUFWmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MMX_PUNPCKHBWirm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MMX_PUNPCKHDQirm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MMX_PUNPCKHWDirm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MMX_PUNPCKLBWirm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MMX_PUNPCKLDQirm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MMX_PUNPCKLWDirm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MOVHPDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MOVHPSrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MOVLPDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MOVLPSrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PACKSSDWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PACKSSWBrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PACKUSDWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PACKUSWBrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PALIGNRrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PBLENDWrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PINSRBrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PINSRDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PINSRQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PINSRWrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PMOVSXBDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PMOVSXBQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PMOVSXBWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PMOVSXDQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PMOVSXWDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PMOVSXWQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PMOVZXBDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PMOVZXBQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PMOVZXBWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex 
"PMOVZXDQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PMOVZXWDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PMOVZXWQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PSHUFBrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PSHUFDmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PSHUFHWmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PSHUFLWmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PUNPCKHBWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PUNPCKHDQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PUNPCKHQDQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PUNPCKHWDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PUNPCKLBWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PUNPCKLDQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PUNPCKLQDQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "PUNPCKLWDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "SHUFPDrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "SHUFPSrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "UNPCKHPDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "UNPCKHPSrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "UNPCKLPDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "UNPCKLPSrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VINSERTPSrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VMOVHPDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VMOVHPSrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VMOVLPDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VMOVLPSrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPACKSSDWYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPACKSSDWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPACKSSWBYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPACKSSWBrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPACKUSDWYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPACKUSDWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex 
"VPACKUSWBYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPACKUSWBrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPALIGNRYrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPALIGNRrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPBLENDWYrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPBLENDWrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPBROADCASTBYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPBROADCASTBrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPBROADCASTWYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPBROADCASTWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPERMILPDYmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPERMILPDYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPERMILPDmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPERMILPDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPERMILPSYmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPERMILPSYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPERMILPSmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPERMILPSrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPINSRBrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPINSRDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPINSRQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPINSRWrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPMOVSXBDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPMOVSXBQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPMOVSXBWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPMOVSXDQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPMOVSXWDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPMOVSXWQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPMOVZXBDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPMOVZXBQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPMOVZXBWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPMOVZXDQrm")>; +def: 
InstRW<[SKLWriteResGroup14], (instregex "VPMOVZXWDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPMOVZXWQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPSHUFBYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPSHUFBrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPSHUFDYmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPSHUFDmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPSHUFHWYmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPSHUFHWmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPSHUFLWYmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPSHUFLWmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKHBWYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKHBWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKHDQYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKHDQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKHQDQYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKHQDQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKHWDYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKHWDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKLBWYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKLBWrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKLDQYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKLDQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKLQDQYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKLQDQrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKLWDYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VPUNPCKLWDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VSHUFPDYrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VSHUFPDrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VSHUFPSYrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VSHUFPSrmi")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VUNPCKHPDYrm")>; +def: 
InstRW<[SKLWriteResGroup14], (instregex "VUNPCKHPDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VUNPCKHPSYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VUNPCKHPSrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VUNPCKLPDYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VUNPCKLPDrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VUNPCKLPSYrm")>; +def: InstRW<[SKLWriteResGroup14], (instregex "VUNPCKLPSrm")>; + +def SKLWriteResGroup15 : SchedWriteRes<[SKLPort6,SKLPort23]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup15], (instregex "FARJMP64")>; +def: InstRW<[SKLWriteResGroup15], (instregex "JMP(16|32|64)m")>; + +def SKLWriteResGroup16 : SchedWriteRes<[SKLPort01,SKLPort23]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup16], (instregex "PABSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PABSDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PABSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PADDSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PADDSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PADDUSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PADDUSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PAVGBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PAVGWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PCMPEQBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PCMPEQDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PCMPEQQrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PCMPEQWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PCMPGTBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PCMPGTDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PCMPGTWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PMAXSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PMAXSDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PMAXSWrm")>; +def: 
InstRW<[SKLWriteResGroup16], (instregex "PMAXUBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PMAXUDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PMAXUWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PMINSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PMINSDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PMINSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PMINUBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PMINUDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PMINUWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSIGNBrm128")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSIGNDrm128")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSIGNWrm128")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSLLDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSLLQrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSLLWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSRADrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSRAWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSRLDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSRLQrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSRLWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSUBSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSUBSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSUBUSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PSUBUSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPABSBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPABSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPABSDYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPABSDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPABSWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPABSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPADDSBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPADDSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPADDSWYrm")>; +def: 
InstRW<[SKLWriteResGroup16], (instregex "VPADDSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPADDUSBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPADDUSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPADDUSWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPADDUSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPAVGBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPAVGBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPAVGWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPAVGWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPEQBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPEQBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPEQDYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPEQDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPEQQYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPEQQrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPEQWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPEQWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPGTBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPGTBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPGTDYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPGTDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPGTWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPCMPGTWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMAXSBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMAXSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMAXSDYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMAXSDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMAXSWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMAXSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMAXUBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMAXUBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMAXUDYrm")>; +def: 
InstRW<[SKLWriteResGroup16], (instregex "VPMAXUDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMAXUWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMAXUWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINSBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINSDYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINSDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINSWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINUBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINUBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINUDYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINUDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINUWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPMINUWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSIGNBYrm256")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSIGNBrm128")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSIGNDYrm256")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSIGNDrm128")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSIGNWYrm256")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSIGNWrm128")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSLLDYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSLLDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSLLQYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSLLQrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSLLVDYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSLLVDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSLLVQYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSLLVQrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSLLWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSLLWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRADYrm")>; +def: 
InstRW<[SKLWriteResGroup16], (instregex "VPSRADrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRAVDYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRAVDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRAWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRAWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRLDYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRLDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRLQYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRLQrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRLVDYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRLVDrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRLVQYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRLVQrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRLWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSRLWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSUBSBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSUBSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSUBSWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSUBSWrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSUBUSBYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSUBUSBrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSUBUSWYrm")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPSUBUSWrm")>; + +def SKLWriteResGroup17 : SchedWriteRes<[SKLPort23,SKLPort05]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PABSBrm64")>; +def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PABSDrm64")>; +def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PABSWrm64")>; +def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PADDBirm")>; +def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PADDDirm")>; +def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PADDQirm")>; +def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PADDWirm")>; 
+// Tail of the SKLWriteResGroup17 table (MMX folded-load ops: load uop on
+// port 2/3 plus one uop on port 0/5; the group def is just above).
+def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PANDNirm")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PANDirm")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PORirm")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PSIGNBrm64")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PSIGNDrm64")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PSIGNWrm64")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PSUBBirm")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PSUBDirm")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PSUBQirm")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PSUBWirm")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "MMX_PXORirm")>;
+
+// 2-uop folded-load GPR ops: load uop (port 2/3) + one uop on port 0/6
+// (ADC/SBB/ADCX/ADOX, memory BT, CMOVcc loads, BMI2 shift/rotate forms).
+// NOTE(review): Latency here counts only the ALU uop; the model's 5-cycle
+// LoadLatency is not folded into these values -- confirm that is intended.
+def SKLWriteResGroup18 : SchedWriteRes<[SKLPort23,SKLPort06]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup18], (instregex "ADC(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "ADC8rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "ADCX32rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "ADCX64rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "ADOX32rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "ADOX64rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "BT(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVAE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVB(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVG(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVGE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVL(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVLE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVNE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVNO(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVNP(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVNS(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVO(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVP(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "CMOVS(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "RORX32mi")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "RORX64mi")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "SARX32rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "SARX64rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "SBB(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "SBB8rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "SHLX32rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "SHLX64rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "SHRX32rm")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "SHRX64rm")>;
+
+// 2-uop folded-load ops using port 1/5: BMI ANDN/BLSI/BLSMSK/BLSR/BZHI and
+// MOVBE loads.
+def SKLWriteResGroup19 : SchedWriteRes<[SKLPort23,SKLPort15]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup19], (instregex "ANDN32rm")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "ANDN64rm")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "BLSI32rm")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "BLSI64rm")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "BLSMSK32rm")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "BLSMSK64rm")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "BLSR32rm")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "BLSR64rm")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "BZHI32rm")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "BZHI64rm")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "MOVBE(16|32|64)rm")>;
+
+// 2-uop folded-load vector ops: load + one uop on any vector ALU port
+// (0/1/5) -- logic ops, integer add/sub, blends, inserts, masked loads.
+def SKLWriteResGroup20 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup20], (instregex "ANDNPDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "ANDNPSrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "ANDPDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "ANDPSrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "BLENDPDrmi")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "BLENDPSrmi")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "ORPDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "ORPSrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PADDBrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PADDDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PADDQrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PADDWrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PANDNrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PANDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PORrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PSUBBrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PSUBDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PSUBQrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PSUBWrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "PXORrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VANDNPDYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VANDNPDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VANDNPSYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VANDNPSrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VANDPDYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VANDPDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VANDPSYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VANDPSrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VBLENDPDYrmi")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VBLENDPDrmi")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VBLENDPSYrmi")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VBLENDPSrmi")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VINSERTF128rm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VINSERTI128rm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VMASKMOVPDYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VMASKMOVPDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VMASKMOVPSYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VMASKMOVPSrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VORPDYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VORPDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VORPSYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VORPSrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPADDBYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPADDBrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPADDDYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPADDDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPADDQYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPADDQrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPADDWYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPADDWrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPANDNYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPANDNrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPANDYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPANDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPBLENDDYrmi")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPBLENDDrmi")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPMASKMOVDYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPMASKMOVDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPMASKMOVQYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPMASKMOVQrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPORYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPORrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPSUBBYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPSUBBrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPSUBDYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPSUBDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPSUBQYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPSUBQrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPSUBWYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPSUBWrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPXORYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VPXORrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VXORPDYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VXORPDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VXORPSYrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "VXORPSrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "XORPDrm")>;
+def: InstRW<[SKLWriteResGroup20], (instregex "XORPSrm")>;
+
+// 2-uop folded-load scalar integer ops: load + one uop on any ALU port
+// (0/1/5/6) -- ADD/AND/CMP/OR/SUB/TEST/XOR rm forms and register POP.
+def SKLWriteResGroup21 : SchedWriteRes<[SKLPort23,SKLPort0156]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup21], (instregex "ADD(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "ADD8rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "AND(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "AND8rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "CMP(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "CMP(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "CMP(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "CMP8mi")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "CMP8mr")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "CMP8rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "OR(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "OR8rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "POP(16|32|64)r(mr?)")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "SUB(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "SUB8rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "TEST(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "TEST8mi")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "TEST8rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "XOR(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup21], (instregex "XOR8rm")>;
+
+// SFENCE: store-address uop (port 2/3/7) + one uop on any ALU port.
+def SKLWriteResGroup22 : SchedWriteRes<[SKLPort237,SKLPort0156]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup22], (instregex "SFENCE")>;
+
+// 3-uop stores needing an extra port-5 uop: extract-to-memory forms and
+// STMXCSR (store data on port 4, store address on port 2/3/7).
+def SKLWriteResGroup23 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup23], (instregex "EXTRACTPSmr")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "PEXTRBmr")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "PEXTRDmr")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "PEXTRQmr")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "PEXTRWmr")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "STMXCSR")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "VEXTRACTPSmr")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "VPEXTRBmr")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "VPEXTRDmr")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "VPEXTRQmr")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "VPEXTRWmr")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "VSTMXCSR")>;
+
+// FNSTCW: store pair plus an extra port-6 uop.
+def SKLWriteResGroup24 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup24], (instregex "FNSTCW16m")>;
+
+// SETcc to memory: store data + store address + one port 0/6 uop.
+def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup25], (instregex "SETAEm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETBm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETEm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETGEm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETGm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETLEm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETLm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETNEm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETNOm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETNPm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETNSm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETOm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETPm")>;
+def: InstRW<[SKLWriteResGroup25], (instregex "SETSm")>;
+
+// MOVBE to memory: store pair plus a port 1/5 uop.
+def SKLWriteResGroup26 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort15]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup26], (instregex "MOVBE(16|32|64)mr")>;
+
+// Register pushes and string stores: store pair plus one any-ALU uop.
+def SKLWriteResGroup27 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup27], (instregex "PUSH(16|32|64)r(mr?)")>;
+def: InstRW<[SKLWriteResGroup27], (instregex "PUSH64i8")>;
+def: InstRW<[SKLWriteResGroup27], (instregex "STOSB")>;
+def: InstRW<[SKLWriteResGroup27], (instregex "STOSL")>;
+def: InstRW<[SKLWriteResGroup27], (instregex "STOSQ")>;
+def: InstRW<[SKLWriteResGroup27], (instregex "STOSW")>;
+
+// 4-uop read-modify-write shifts and bit-test-and-modify: load + port 0/6
+// uop + store pair.
+def SKLWriteResGroup28 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
+  let Latency = 1;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup28], (instregex "BTC(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "BTR(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "BTS(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SAR(16|32|64)m1")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SAR(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SAR8m1")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SAR8mi")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SHL(16|32|64)m1")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SHL(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SHL8m1")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SHL8mi")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SHR(16|32|64)m1")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SHR(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SHR8m1")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "SHR8mi")>;
+
+// 4-uop read-modify-write integer ALU ops (plus memory-indirect POP/PUSH):
+// load + any-ALU uop + store pair.
+def SKLWriteResGroup29 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
+  let Latency = 1;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup29], (instregex "ADD(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "ADD(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "ADD8mi")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "ADD8mr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "AND(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "AND(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "AND8mi")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "AND8mr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "DEC(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "DEC8m")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "INC(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "INC8m")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "NEG(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "NEG8m")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "NOT(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "NOT8m")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "OR(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "OR(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "OR8mi")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "OR8mr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "POP(16|32|64)rmm")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "PUSH(16|32|64)rmm")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "SUB(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "SUB(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "SUB8mi")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "SUB8mr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "XOR(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "XOR(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "XOR8mi")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "XOR8mr")>;
+
+// NOTE(review): no InstRW visible in this chunk maps to SKLWriteResGroup30;
+// it appears unused -- verify against the table generator before relying on
+// or removing it.
+def SKLWriteResGroup30 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort15]> {
+  let Latency = 1;
+  let NumMicroOps = 5;
+  let ResourceCycles = [2,2,1];
+}
+
+// Single-uop port-0 ops with 2-cycle latency: COMIS/UCOMIS compares,
+// move-mask, VTESTP, and XMM/MMX-to-GPR moves.
+def SKLWriteResGroup31 : SchedWriteRes<[SKLPort0]> {
+  let Latency = 2;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup31], (instregex "COMISDrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "COMISSrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "MMX_MOVD64grr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "MMX_PMOVMSKBrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "MOVMSKPDrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "MOVMSKPSrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "MOVPDI2DIrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "MOVPQIto64rr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "PMOVMSKBrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "UCOMISDrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "UCOMISSrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VCOMISDrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VCOMISSrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VMOVMSKPDYrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VMOVMSKPDrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VMOVMSKPSYrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VMOVMSKPSrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VMOVPDI2DIrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VMOVPQIto64rr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VPMOVMSKBYrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VPMOVMSKBrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VTESTPDYrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VTESTPDrr")>;
+// Tail of the SKLWriteResGroup31 (single port-0 uop, latency 2) table.
+def: InstRW<[SKLWriteResGroup31], (instregex "VTESTPSYrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VTESTPSrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VUCOMISDrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VUCOMISSrr")>;
+
+// 2-uop port-5 ops, latency 2: GPR-to-vector inserts and MMX_MOVQ2DQ.
+def SKLWriteResGroup32 : SchedWriteRes<[SKLPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup32], (instregex "MMX_MOVQ2DQrr")>;
+def: InstRW<[SKLWriteResGroup32], (instregex "MMX_PINSRWirri")>;
+def: InstRW<[SKLWriteResGroup32], (instregex "PINSRBrr")>;
+def: InstRW<[SKLWriteResGroup32], (instregex "PINSRDrr")>;
+def: InstRW<[SKLWriteResGroup32], (instregex "PINSRQrr")>;
+def: InstRW<[SKLWriteResGroup32], (instregex "PINSRWrri")>;
+def: InstRW<[SKLWriteResGroup32], (instregex "VPINSRBrr")>;
+def: InstRW<[SKLWriteResGroup32], (instregex "VPINSRDrr")>;
+def: InstRW<[SKLWriteResGroup32], (instregex "VPINSRQrr")>;
+def: InstRW<[SKLWriteResGroup32], (instregex "VPINSRWrri")>;
+
+// 2 uops on port 0/5, latency 2.
+def SKLWriteResGroup33 : SchedWriteRes<[SKLPort05]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup33], (instregex "FDECSTP")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "MMX_MOVDQ2Qrr")>;
+
+// 2 uops on port 0/6, latency 2: CMOVA/CMOVBE, rotates, SETA/SETBE.
+def SKLWriteResGroup34 : SchedWriteRes<[SKLPort06]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup34], (instregex "CMOVA(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "CMOVBE(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "ROL(16|32|64)r1")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "ROL(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "ROL8r1")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "ROL8ri")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "ROR(16|32|64)r1")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "ROR(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "ROR8r1")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "ROR8ri")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "SETAr")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "SETBEr")>;
+
+// Variable blends: 2 uops on port 0/1/5, latency 2.
+def SKLWriteResGroup35 : SchedWriteRes<[SKLPort015]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup35], (instregex "BLENDVPDrr0")>;
+def: InstRW<[SKLWriteResGroup35], (instregex "BLENDVPSrr0")>;
+def: InstRW<[SKLWriteResGroup35], (instregex "PBLENDVBrr0")>;
+def: InstRW<[SKLWriteResGroup35], (instregex "VBLENDVPDYrr")>;
+def: InstRW<[SKLWriteResGroup35], (instregex "VBLENDVPDrr")>;
+def: InstRW<[SKLWriteResGroup35], (instregex "VBLENDVPSYrr")>;
+def: InstRW<[SKLWriteResGroup35], (instregex "VBLENDVPSrr")>;
+def: InstRW<[SKLWriteResGroup35], (instregex "VPBLENDVBYrr")>;
+def: InstRW<[SKLWriteResGroup35], (instregex "VPBLENDVBrr")>;
+
+// 2 uops on any ALU port, latency 2: LFENCE, WAIT, XGETBV.
+def SKLWriteResGroup36 : SchedWriteRes<[SKLPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup36], (instregex "LFENCE")>;
+def: InstRW<[SKLWriteResGroup36], (instregex "WAIT")>;
+def: InstRW<[SKLWriteResGroup36], (instregex "XGETBV")>;
+
+// Folded-load forms of the port-0 compare/test group: load uop (port 2/3)
+// + port-0 uop.
+def SKLWriteResGroup37 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup37], (instregex "COMISDrm")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "COMISSrm")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "UCOMISDrm")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "UCOMISSrm")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "VCOMISDrm")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "VCOMISSrm")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "VTESTPDYrm")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "VTESTPDrm")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "VTESTPSYrm")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "VTESTPSrm")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "VUCOMISDrm")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "VUCOMISSrm")>;
+
+// Register-form vector shifts (rr): one port-5 uop + one port 0/1 uop.
+def SKLWriteResGroup38 : SchedWriteRes<[SKLPort5,SKLPort01]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup38], (instregex "PSLLDrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "PSLLQrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "PSLLWrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "PSRADrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "PSRAWrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "PSRLDrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "PSRLQrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "PSRLWrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPSLLDrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPSLLQrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPSLLWrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPSRADrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPSRAWrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPSRLDrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPSRLQrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPSRLWrr")>;
+
+// CLFLUSH: port-6 uop + any-ALU uop.
+def SKLWriteResGroup39 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup39], (instregex "CLFLUSH")>;
+
+// BEXTR and BSWAP: one port 0/6 uop + one port 1/5 uop.
+def SKLWriteResGroup40 : SchedWriteRes<[SKLPort06,SKLPort15]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup40], (instregex "BEXTR32rr")>;
+def: InstRW<[SKLWriteResGroup40], (instregex "BEXTR64rr")>;
+def: InstRW<[SKLWriteResGroup40], (instregex "BSWAP(16|32|64)r")>;
+
+// 2-uop ops split across port 0/6 and any ALU port (this table continues
+// past the end of this chunk).
+def SKLWriteResGroup41 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup41], (instregex "ADC8i8")>;
+def: InstRW<[SKLWriteResGroup41], (instregex "ADC8ri")>;
+def: InstRW<[SKLWriteResGroup41], (instregex "CWD")>;
+def: 
InstRW<[SKLWriteResGroup41], (instregex "JRCXZ")>; +def: InstRW<[SKLWriteResGroup41], (instregex "SBB8i8")>; +def: InstRW<[SKLWriteResGroup41], (instregex "SBB8ri")>; + +def SKLWriteResGroup42 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup42], (instregex "MMX_PACKSSDWirm")>; +def: InstRW<[SKLWriteResGroup42], (instregex "MMX_PACKSSWBirm")>; +def: InstRW<[SKLWriteResGroup42], (instregex "MMX_PACKUSWBirm")>; + +def SKLWriteResGroup43 : SchedWriteRes<[SKLPort23,SKLPort06]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup43], (instregex "CMOVA(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup43], (instregex "CMOVBE(16|32|64)rm")>; + +def SKLWriteResGroup44 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup44], (instregex "BLENDVPDrm0")>; +def: InstRW<[SKLWriteResGroup44], (instregex "BLENDVPSrm0")>; +def: InstRW<[SKLWriteResGroup44], (instregex "PBLENDVBrm0")>; +def: InstRW<[SKLWriteResGroup44], (instregex "VBLENDVPDYrm")>; +def: InstRW<[SKLWriteResGroup44], (instregex "VBLENDVPDrm")>; +def: InstRW<[SKLWriteResGroup44], (instregex "VBLENDVPSYrm")>; +def: InstRW<[SKLWriteResGroup44], (instregex "VBLENDVPSrm")>; +def: InstRW<[SKLWriteResGroup44], (instregex "VPBLENDVBYrm")>; +def: InstRW<[SKLWriteResGroup44], (instregex "VPBLENDVBrm")>; + +def SKLWriteResGroup45 : SchedWriteRes<[SKLPort23,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup45], (instregex "LEAVE64")>; +def: InstRW<[SKLWriteResGroup45], (instregex "SCASB")>; +def: InstRW<[SKLWriteResGroup45], (instregex "SCASL")>; +def: InstRW<[SKLWriteResGroup45], (instregex "SCASQ")>; +def: InstRW<[SKLWriteResGroup45], (instregex "SCASW")>; + +def SKLWriteResGroup46 : 
SchedWriteRes<[SKLPort237,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup46], (instregex "MFENCE")>; + +def SKLWriteResGroup47 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup47], (instregex "FNSTSWm")>; + +def SKLWriteResGroup48 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup48], (instregex "FLDCW16m")>; + +def SKLWriteResGroup49 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup49], (instregex "LDMXCSR")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VLDMXCSR")>; + +def SKLWriteResGroup50 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup50], (instregex "KMOVBkm")>; +def: InstRW<[SKLWriteResGroup50], (instregex "KMOVDkm")>; +def: InstRW<[SKLWriteResGroup50], (instregex "KMOVQkm")>; +def: InstRW<[SKLWriteResGroup50], (instregex "KMOVWkm")>; + +def SKLWriteResGroup51 : SchedWriteRes<[SKLPort6,SKLPort23,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup51], (instregex "LRETQ")>; +def: InstRW<[SKLWriteResGroup51], (instregex "RETQ")>; + +def SKLWriteResGroup52 : SchedWriteRes<[SKLPort23,SKLPort06,SKLPort15]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup52], (instregex "BEXTR32rm")>; +def: InstRW<[SKLWriteResGroup52], (instregex "BEXTR64rm")>; + +def SKLWriteResGroup53 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> { + let Latency = 2; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKLWriteResGroup53], 
(instregex "SETAm")>; +def: InstRW<[SKLWriteResGroup53], (instregex "SETBEm")>; + +def SKLWriteResGroup54 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup54], (instregex "CALL(16|32|64)r")>; + +def SKLWriteResGroup55 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup55], (instregex "CALL64pcrel32")>; + +def SKLWriteResGroup56 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> { + let Latency = 2; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup56], (instregex "ROL(16|32|64)m1")>; +def: InstRW<[SKLWriteResGroup56], (instregex "ROL(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup56], (instregex "ROL8m1")>; +def: InstRW<[SKLWriteResGroup56], (instregex "ROL8mi")>; +def: InstRW<[SKLWriteResGroup56], (instregex "ROR(16|32|64)m1")>; +def: InstRW<[SKLWriteResGroup56], (instregex "ROR(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup56], (instregex "ROR8m1")>; +def: InstRW<[SKLWriteResGroup56], (instregex "ROR8mi")>; + +def SKLWriteResGroup57 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup57], (instregex "XADD(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup57], (instregex "XADD8rm")>; + +def SKLWriteResGroup58 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup58], (instregex "CALL(16|32|64)m")>; +def: InstRW<[SKLWriteResGroup58], (instregex "FARCALL64")>; + +def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup59], (instregex 
"KADDBrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KADDDrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KADDQrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KADDWrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KMOVBrk")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KMOVDrk")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KMOVQrk")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KMOVWrk")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KORTESTBrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KORTESTDrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KORTESTQrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KORTESTWrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KTESTBrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KTESTDrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KTESTQrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "KTESTWrr")>; + +def SKLWriteResGroup60 : SchedWriteRes<[SKLPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup60], (instregex "BSF(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "BSR(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "IMUL64rr(i8?)")>; +def: InstRW<[SKLWriteResGroup60], (instregex "IMUL8r")>; +def: InstRW<[SKLWriteResGroup60], (instregex "LZCNT(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "MUL8r")>; +def: InstRW<[SKLWriteResGroup60], (instregex "PDEP32rr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "PDEP64rr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "PEXT32rr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "PEXT64rr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "POPCNT(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "SHLD(16|32|64)rri8")>; +def: InstRW<[SKLWriteResGroup60], (instregex "SHRD(16|32|64)rri8")>; +def: InstRW<[SKLWriteResGroup60], (instregex "TZCNT(16|32|64)rr")>; + +def SKLWriteResGroup60_16 : 
SchedWriteRes<[SKLPort1, SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup60_16], (instregex "IMUL16rr(i8?)")>; + +def SKLWriteResGroup60_32 : SchedWriteRes<[SKLPort1]> { + let Latency = 3; + let NumMicroOps = 1; +} +def: InstRW<[SKLWriteResGroup60_32], (instregex "IMUL32rr(i8?)")>; + +def SKLWriteResGroup61 : SchedWriteRes<[SKLPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup61], (instregex "ADD_FPrST0")>; +def: InstRW<[SKLWriteResGroup61], (instregex "ADD_FST0r")>; +def: InstRW<[SKLWriteResGroup61], (instregex "ADD_FrST0")>; +def: InstRW<[SKLWriteResGroup61], (instregex "KSHIFTLBri")>; +def: InstRW<[SKLWriteResGroup61], (instregex "KSHIFTLDri")>; +def: InstRW<[SKLWriteResGroup61], (instregex "KSHIFTLQri")>; +def: InstRW<[SKLWriteResGroup61], (instregex "KSHIFTLWri")>; +def: InstRW<[SKLWriteResGroup61], (instregex "KSHIFTRBri")>; +def: InstRW<[SKLWriteResGroup61], (instregex "KSHIFTRDri")>; +def: InstRW<[SKLWriteResGroup61], (instregex "KSHIFTRQri")>; +def: InstRW<[SKLWriteResGroup61], (instregex "KSHIFTRWri")>; +def: InstRW<[SKLWriteResGroup61], (instregex "KUNPCKBWrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "KUNPCKDQrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "KUNPCKWDrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "MMX_PSADBWirr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "PCMPGTQrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "PSADBWrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "SUBR_FPrST0")>; +def: InstRW<[SKLWriteResGroup61], (instregex "SUBR_FST0r")>; +def: InstRW<[SKLWriteResGroup61], (instregex "SUBR_FrST0")>; +def: InstRW<[SKLWriteResGroup61], (instregex "SUB_FPrST0")>; +def: InstRW<[SKLWriteResGroup61], (instregex "SUB_FST0r")>; +def: InstRW<[SKLWriteResGroup61], (instregex "SUB_FrST0")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VBROADCASTSDYrr")>; +def: 
InstRW<[SKLWriteResGroup61], (instregex "VBROADCASTSSYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VEXTRACTF128rr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VEXTRACTI128rr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VINSERTF128rr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VINSERTI128rr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPBROADCASTBYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPBROADCASTBrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPBROADCASTDYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPBROADCASTQYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPBROADCASTWYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPBROADCASTWrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPCMPGTQYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPCMPGTQrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPERM2F128rr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPERM2I128rr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPERMDYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPERMPDYri")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPERMPSYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPERMQYri")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVSXBDYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVSXBQYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVSXBWYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVSXDQYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVSXWDYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVSXWQYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVZXBDYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVZXBQYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVZXBWYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVZXDQYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVZXWDYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPMOVZXWQYrr")>; +def: 
InstRW<[SKLWriteResGroup61], (instregex "VPSADBWYrr")>; +def: InstRW<[SKLWriteResGroup61], (instregex "VPSADBWrr")>; + +def SKLWriteResGroup62 : SchedWriteRes<[SKLPort0,SKLPort5]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup62], (instregex "EXTRACTPSrr")>; +def: InstRW<[SKLWriteResGroup62], (instregex "MMX_PEXTRWirri")>; +def: InstRW<[SKLWriteResGroup62], (instregex "PEXTRBrr")>; +def: InstRW<[SKLWriteResGroup62], (instregex "PEXTRDrr")>; +def: InstRW<[SKLWriteResGroup62], (instregex "PEXTRQrr")>; +def: InstRW<[SKLWriteResGroup62], (instregex "PEXTRWri")>; +def: InstRW<[SKLWriteResGroup62], (instregex "PEXTRWrr_REV")>; +def: InstRW<[SKLWriteResGroup62], (instregex "PTESTrr")>; +def: InstRW<[SKLWriteResGroup62], (instregex "VEXTRACTPSrr")>; +def: InstRW<[SKLWriteResGroup62], (instregex "VPEXTRBrr")>; +def: InstRW<[SKLWriteResGroup62], (instregex "VPEXTRDrr")>; +def: InstRW<[SKLWriteResGroup62], (instregex "VPEXTRQrr")>; +def: InstRW<[SKLWriteResGroup62], (instregex "VPEXTRWri")>; +def: InstRW<[SKLWriteResGroup62], (instregex "VPEXTRWrr_REV")>; +def: InstRW<[SKLWriteResGroup62], (instregex "VPTESTYrr")>; +def: InstRW<[SKLWriteResGroup62], (instregex "VPTESTrr")>; + +def SKLWriteResGroup63 : SchedWriteRes<[SKLPort0,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup63], (instregex "FNSTSW16r")>; + +def SKLWriteResGroup64 : SchedWriteRes<[SKLPort1,SKLPort23]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup64], (instregex "BSF(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup64], (instregex "BSR(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup64], (instregex "IMUL64m")>; +def: InstRW<[SKLWriteResGroup64], (instregex "IMUL(32|64)rm(i8?)")>; +def: InstRW<[SKLWriteResGroup64], (instregex "IMUL8m")>; +def: InstRW<[SKLWriteResGroup64], (instregex "LZCNT(16|32|64)rm")>; +def: 
InstRW<[SKLWriteResGroup64], (instregex "MUL64m")>; +def: InstRW<[SKLWriteResGroup64], (instregex "MUL8m")>; +def: InstRW<[SKLWriteResGroup64], (instregex "PDEP32rm")>; +def: InstRW<[SKLWriteResGroup64], (instregex "PDEP64rm")>; +def: InstRW<[SKLWriteResGroup64], (instregex "PEXT32rm")>; +def: InstRW<[SKLWriteResGroup64], (instregex "PEXT64rm")>; +def: InstRW<[SKLWriteResGroup64], (instregex "POPCNT(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup64], (instregex "TZCNT(16|32|64)rm")>; + +def SKLWriteResGroup64_16 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup64_16], (instregex "IMUL16rm(i8?)")>; + +def SKLWriteResGroup64_16_2 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> { + let Latency = 3; + let NumMicroOps = 5; +} +def: InstRW<[SKLWriteResGroup64_16_2], (instregex "IMUL16m")>; +def: InstRW<[SKLWriteResGroup64_16_2], (instregex "MUL16m")>; + +def SKLWriteResGroup64_32 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup64_32], (instregex "IMUL32m")>; +def: InstRW<[SKLWriteResGroup64_32], (instregex "MUL32m")>; + +def SKLWriteResGroup65 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup65], (instregex "ADD_F32m")>; +def: InstRW<[SKLWriteResGroup65], (instregex "ADD_F64m")>; +def: InstRW<[SKLWriteResGroup65], (instregex "ILD_F16m")>; +def: InstRW<[SKLWriteResGroup65], (instregex "ILD_F32m")>; +def: InstRW<[SKLWriteResGroup65], (instregex "ILD_F64m")>; +def: InstRW<[SKLWriteResGroup65], (instregex "MMX_PSADBWirm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "PCMPGTQrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "PSADBWrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "SUBR_F32m")>; +def: InstRW<[SKLWriteResGroup65], (instregex 
"SUBR_F64m")>; +def: InstRW<[SKLWriteResGroup65], (instregex "SUB_F32m")>; +def: InstRW<[SKLWriteResGroup65], (instregex "SUB_F64m")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPCMPGTQYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPCMPGTQrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPERM2F128rm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPERM2I128rm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPERMDYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPERMPDYmi")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPERMPSYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPERMQYmi")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVSXBDYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVSXBQYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVSXBWYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVSXDQYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVSXWDYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVSXWQYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVZXBDYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVZXBQYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVZXBWYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVZXDQYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVZXWDYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPMOVZXWQYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPSADBWYrm")>; +def: InstRW<[SKLWriteResGroup65], (instregex "VPSADBWrm")>; + +def SKLWriteResGroup66 : SchedWriteRes<[SKLPort06]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKLWriteResGroup66], (instregex "ROL(16|32|64)rCL")>; +def: InstRW<[SKLWriteResGroup66], (instregex "ROL8rCL")>; +def: InstRW<[SKLWriteResGroup66], (instregex "ROR(16|32|64)rCL")>; +def: InstRW<[SKLWriteResGroup66], (instregex "ROR8rCL")>; +def: InstRW<[SKLWriteResGroup66], (instregex "SAR(16|32|64)rCL")>; +def: 
InstRW<[SKLWriteResGroup66], (instregex "SAR8rCL")>; +def: InstRW<[SKLWriteResGroup66], (instregex "SHL(16|32|64)rCL")>; +def: InstRW<[SKLWriteResGroup66], (instregex "SHL8rCL")>; +def: InstRW<[SKLWriteResGroup66], (instregex "SHR(16|32|64)rCL")>; +def: InstRW<[SKLWriteResGroup66], (instregex "SHR8rCL")>; + +def SKLWriteResGroup67 : SchedWriteRes<[SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKLWriteResGroup67], (instregex "XADD(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup67], (instregex "XADD8rr")>; +def: InstRW<[SKLWriteResGroup67], (instregex "XCHG8rr")>; + +def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0,SKLPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup68], (instregex "MMX_PHADDSWrr64")>; +def: InstRW<[SKLWriteResGroup68], (instregex "MMX_PHSUBSWrr64")>; + +def SKLWriteResGroup69 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup69], (instregex "PHADDSWrr128")>; +def: InstRW<[SKLWriteResGroup69], (instregex "PHSUBSWrr128")>; +def: InstRW<[SKLWriteResGroup69], (instregex "VPHADDSWrr128")>; +def: InstRW<[SKLWriteResGroup69], (instregex "VPHADDSWrr256")>; +def: InstRW<[SKLWriteResGroup69], (instregex "VPHSUBSWrr128")>; +def: InstRW<[SKLWriteResGroup69], (instregex "VPHSUBSWrr256")>; + +def SKLWriteResGroup70 : SchedWriteRes<[SKLPort5,SKLPort05]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup70], (instregex "MMX_PHADDWrr64")>; +def: InstRW<[SKLWriteResGroup70], (instregex "MMX_PHADDrr64")>; +def: InstRW<[SKLWriteResGroup70], (instregex "MMX_PHSUBDrr64")>; +def: InstRW<[SKLWriteResGroup70], (instregex "MMX_PHSUBWrr64")>; + +def SKLWriteResGroup71 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: 
InstRW<[SKLWriteResGroup71], (instregex "PHADDDrr")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PHADDWrr")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PHSUBDrr")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PHSUBWrr")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPHADDDYrr")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPHADDDrr")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPHADDWYrr")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPHADDWrr")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPHSUBDYrr")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPHSUBDrr")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPHSUBWYrr")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPHSUBWrr")>; + +def SKLWriteResGroup72 : SchedWriteRes<[SKLPort5,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup72], (instregex "MMX_PACKSSDWirr")>; +def: InstRW<[SKLWriteResGroup72], (instregex "MMX_PACKSSWBirr")>; +def: InstRW<[SKLWriteResGroup72], (instregex "MMX_PACKUSWBirr")>; + +def SKLWriteResGroup73 : SchedWriteRes<[SKLPort6,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup73], (instregex "CLD")>; + +def SKLWriteResGroup74 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup74], (instregex "RCL(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup74], (instregex "RCL(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup74], (instregex "RCL8r1")>; +def: InstRW<[SKLWriteResGroup74], (instregex "RCL8ri")>; +def: InstRW<[SKLWriteResGroup74], (instregex "RCR(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup74], (instregex "RCR(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup74], (instregex "RCR8r1")>; +def: InstRW<[SKLWriteResGroup74], (instregex "RCR8ri")>; + +def SKLWriteResGroup75 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let 
Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup75], (instregex "PTESTrm")>; +def: InstRW<[SKLWriteResGroup75], (instregex "VPTESTYrm")>; +def: InstRW<[SKLWriteResGroup75], (instregex "VPTESTrm")>; + +def SKLWriteResGroup76 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup76], (instregex "ISTT_FP16m")>; +def: InstRW<[SKLWriteResGroup76], (instregex "ISTT_FP32m")>; +def: InstRW<[SKLWriteResGroup76], (instregex "ISTT_FP64m")>; +def: InstRW<[SKLWriteResGroup76], (instregex "IST_F16m")>; +def: InstRW<[SKLWriteResGroup76], (instregex "IST_F32m")>; +def: InstRW<[SKLWriteResGroup76], (instregex "IST_FP16m")>; +def: InstRW<[SKLWriteResGroup76], (instregex "IST_FP32m")>; +def: InstRW<[SKLWriteResGroup76], (instregex "IST_FP64m")>; + +def SKLWriteResGroup77 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SKLWriteResGroup77], (instregex "MMX_PHADDSWrm64")>; +def: InstRW<[SKLWriteResGroup77], (instregex "MMX_PHSUBSWrm64")>; + +def SKLWriteResGroup78 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup78], (instregex "PHADDSWrm128")>; +def: InstRW<[SKLWriteResGroup78], (instregex "PHSUBSWrm128")>; +def: InstRW<[SKLWriteResGroup78], (instregex "VPHADDSWrm128")>; +def: InstRW<[SKLWriteResGroup78], (instregex "VPHADDSWrm256")>; +def: InstRW<[SKLWriteResGroup78], (instregex "VPHSUBSWrm128")>; +def: InstRW<[SKLWriteResGroup78], (instregex "VPHSUBSWrm256")>; + +def SKLWriteResGroup79 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort05]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup79], (instregex "MMX_PHADDWrm64")>; +def: InstRW<[SKLWriteResGroup79], (instregex 
"MMX_PHADDrm64")>; +def: InstRW<[SKLWriteResGroup79], (instregex "MMX_PHSUBDrm64")>; +def: InstRW<[SKLWriteResGroup79], (instregex "MMX_PHSUBWrm64")>; + +def SKLWriteResGroup80 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup80], (instregex "PHADDDrm")>; +def: InstRW<[SKLWriteResGroup80], (instregex "PHADDWrm")>; +def: InstRW<[SKLWriteResGroup80], (instregex "PHSUBDrm")>; +def: InstRW<[SKLWriteResGroup80], (instregex "PHSUBWrm")>; +def: InstRW<[SKLWriteResGroup80], (instregex "VPHADDDYrm")>; +def: InstRW<[SKLWriteResGroup80], (instregex "VPHADDDrm")>; +def: InstRW<[SKLWriteResGroup80], (instregex "VPHADDWYrm")>; +def: InstRW<[SKLWriteResGroup80], (instregex "VPHADDWrm")>; +def: InstRW<[SKLWriteResGroup80], (instregex "VPHSUBDYrm")>; +def: InstRW<[SKLWriteResGroup80], (instregex "VPHSUBDrm")>; +def: InstRW<[SKLWriteResGroup80], (instregex "VPHSUBWYrm")>; +def: InstRW<[SKLWriteResGroup80], (instregex "VPHSUBWrm")>; + +def SKLWriteResGroup81 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06]> { + let Latency = 3; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKLWriteResGroup81], (instregex "ROR(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup81], (instregex "ROR8mCL")>; + +def SKLWriteResGroup82 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup82], (instregex "RCL(16|32|64)m1")>; +def: InstRW<[SKLWriteResGroup82], (instregex "RCL(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup82], (instregex "RCL8m1")>; +def: InstRW<[SKLWriteResGroup82], (instregex "RCL8mi")>; +def: InstRW<[SKLWriteResGroup82], (instregex "RCR(16|32|64)m1")>; +def: InstRW<[SKLWriteResGroup82], (instregex "RCR(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup82], (instregex "RCR8m1")>; +def: InstRW<[SKLWriteResGroup82], (instregex "RCR8mi")>; + +def 
SKLWriteResGroup83 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> { + let Latency = 3; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,3]; +} +def: InstRW<[SKLWriteResGroup83], (instregex "ROL(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup83], (instregex "ROL8mCL")>; +def: InstRW<[SKLWriteResGroup83], (instregex "SAR(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup83], (instregex "SAR8mCL")>; +def: InstRW<[SKLWriteResGroup83], (instregex "SHL(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup83], (instregex "SHL8mCL")>; +def: InstRW<[SKLWriteResGroup83], (instregex "SHR(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup83], (instregex "SHR8mCL")>; + +def SKLWriteResGroup84 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,3]; +} +def: InstRW<[SKLWriteResGroup84], (instregex "ADC(16|32|64)mi8")>; +def: InstRW<[SKLWriteResGroup84], (instregex "ADC8mi")>; + +def SKLWriteResGroup85 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,2,1]; +} +def: InstRW<[SKLWriteResGroup85], (instregex "ADC(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup85], (instregex "ADC8mr")>; +def: InstRW<[SKLWriteResGroup85], (instregex "CMPXCHG(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "CMPXCHG8rm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "SBB(16|32|64)mi8")>; +def: InstRW<[SKLWriteResGroup85], (instregex "SBB(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup85], (instregex "SBB8mi")>; +def: InstRW<[SKLWriteResGroup85], (instregex "SBB8mr")>; + +def SKLWriteResGroup86 : SchedWriteRes<[SKLPort0]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup86], (instregex "AESDECLASTrr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "AESDECrr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "AESENCLASTrr")>; +def: 
InstRW<[SKLWriteResGroup86], (instregex "AESENCrr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "MMX_PMADDUBSWrr64")>; +def: InstRW<[SKLWriteResGroup86], (instregex "MMX_PMADDWDirr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "MMX_PMULHRSWrr64")>; +def: InstRW<[SKLWriteResGroup86], (instregex "MMX_PMULHUWirr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "MMX_PMULHWirr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "MMX_PMULLWirr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "MMX_PMULUDQirr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "MUL_FPrST0")>; +def: InstRW<[SKLWriteResGroup86], (instregex "MUL_FST0r")>; +def: InstRW<[SKLWriteResGroup86], (instregex "MUL_FrST0")>; +def: InstRW<[SKLWriteResGroup86], (instregex "RCPPSr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "RCPSSr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "RSQRTPSr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "RSQRTSSr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "VAESDECLASTrr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "VAESDECrr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "VAESENCLASTrr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "VAESENCrr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "VRCPPSYr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "VRCPPSr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "VRCPSSr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "VRSQRTPSYr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "VRSQRTPSr")>; +def: InstRW<[SKLWriteResGroup86], (instregex "VRSQRTSSr")>; + +def SKLWriteResGroup87 : SchedWriteRes<[SKLPort01]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup87], (instregex "ADDPDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "ADDPSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "ADDSDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "ADDSSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "ADDSUBPDrr")>; +def: 
InstRW<[SKLWriteResGroup87], (instregex "ADDSUBPSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "MULPDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "MULPSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "MULSDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "MULSSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "SUBPDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "SUBPSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "SUBSDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "SUBSSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VADDPDYrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VADDPDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VADDPSYrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VADDPSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VADDSDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VADDSSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VADDSUBPDYrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VADDSUBPDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VADDSUBPSYrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VADDSUBPSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD132PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD132PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD132PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD132PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD132SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD132SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD213PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD213PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD213PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD213PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD213SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD213SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD231PDYr")>; +def: 
InstRW<[SKLWriteResGroup87], (instregex "VFMADD231PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD231PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD231PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD231SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADD231SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB132PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB132PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB132PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB132PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB213PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB213PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB213PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB213PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB231PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB231PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB231PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMADDSUB231PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB132PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB132PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB132PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB132PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB132SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB132SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB213PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB213PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB213PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB213PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB213SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB213SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB231PDYr")>; +def: InstRW<[SKLWriteResGroup87], 
(instregex "VFMSUB231PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB231PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB231PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB231SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUB231SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD132PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD132PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD132PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD132PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD213PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD213PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD213PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD213PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD231PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD231PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD231PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFMSUBADD231PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD132PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD132PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD132PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD132PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD132SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD132SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD213PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD213PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD213PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD213PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD213SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD213SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD231PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex 
"VFNMADD231PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD231PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD231PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD231SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMADD231SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB132PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB132PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB132PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB132PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB132SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB132SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB213PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB213PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB213PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB213PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB213SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB213SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB231PDYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB231PDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB231PSYr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB231PSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB231SDr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VFNMSUB231SSr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VMULPDYrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VMULPDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VMULPSYrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VMULPSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VMULSDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VMULSSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VSUBPDYrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VSUBPDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VSUBPSYrr")>; 
+def: InstRW<[SKLWriteResGroup87], (instregex "VSUBPSrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VSUBSDrr")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VSUBSSrr")>; + +def SKLWriteResGroup88 : SchedWriteRes<[SKLPort05]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} + +def SKLWriteResGroup89 : SchedWriteRes<[SKLPort015]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup89], (instregex "CMPPDrri")>; +def: InstRW<[SKLWriteResGroup89], (instregex "CMPPSrri")>; +def: InstRW<[SKLWriteResGroup89], (instregex "CMPSSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "CVTDQ2PSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "CVTPS2DQrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "CVTTPS2DQrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "MAXPDrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "MAXPSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "MAXSDrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "MAXSSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "MINPDrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "MINPSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "MINSDrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "MINSSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "PHMINPOSUWrr128")>; +def: InstRW<[SKLWriteResGroup89], (instregex "PMADDUBSWrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "PMADDWDrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "PMULDQrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "PMULHRSWrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "PMULHUWrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "PMULHWrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "PMULLWrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "PMULUDQrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCMPPDYrri")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCMPPDrri")>; +def: InstRW<[SKLWriteResGroup89], 
(instregex "VCMPPSYrri")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCMPPSrri")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCMPSDrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCMPSSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTDQ2PSYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTDQ2PSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPS2DQYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPS2DQrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTTPS2DQYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTTPS2DQrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMAXPDYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMAXPDrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMAXPSYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMAXPSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMAXSDrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMAXSSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMINPDYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMINPDrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMINPSYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMINPSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMINSDrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VMINSSrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPHMINPOSUWrr128")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMADDUBSWYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMADDUBSWrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMADDWDYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMADDWDrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMULDQYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMULDQrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMULHRSWYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMULHRSWrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMULHUWYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex 
"VPMULHUWrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMULHWYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMULHWrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMULLWYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMULLWrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMULUDQYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VPMULUDQrr")>; + +def SKLWriteResGroup90 : SchedWriteRes<[SKLPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup90], (instregex "MPSADBWrri")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VMPSADBWYrri")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VMPSADBWrri")>; + +def SKLWriteResGroup91 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup91], (instregex "AESDECLASTrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "AESDECrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "AESENCLASTrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "AESENCrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "MMX_CVTPI2PSirm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "MMX_PMADDUBSWrm64")>; +def: InstRW<[SKLWriteResGroup91], (instregex "MMX_PMADDWDirm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "MMX_PMULHRSWrm64")>; +def: InstRW<[SKLWriteResGroup91], (instregex "MMX_PMULHUWirm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "MMX_PMULHWirm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "MMX_PMULLWirm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "MMX_PMULUDQirm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "MUL_F32m")>; +def: InstRW<[SKLWriteResGroup91], (instregex "MUL_F64m")>; +def: InstRW<[SKLWriteResGroup91], (instregex "RCPPSm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "RCPSSm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "RSQRTPSm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "RSQRTSSm")>; +def: 
InstRW<[SKLWriteResGroup91], (instregex "VAESDECLASTrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VAESDECrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VAESENCLASTrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VAESENCrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VRCPPSYm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VRCPPSm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VRCPSSm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VRSQRTPSYm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VRSQRTPSm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VRSQRTSSm")>; + +def SKLWriteResGroup92 : SchedWriteRes<[SKLPort1,SKLPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup92], (instregex "IMUL64r")>; +def: InstRW<[SKLWriteResGroup92], (instregex "MUL64r")>; +def: InstRW<[SKLWriteResGroup92], (instregex "MULX64rr")>; + +def SKLWriteResGroup92_16 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def: InstRW<[SKLWriteResGroup92_16], (instregex "IMUL16r")>; +def: InstRW<[SKLWriteResGroup92_16], (instregex "MUL16r")>; + +def SKLWriteResGroup92_32 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup92_32], (instregex "IMUL32r")>; +def: InstRW<[SKLWriteResGroup92_32], (instregex "MUL32r")>; +def: InstRW<[SKLWriteResGroup92_32], (instregex "MULX32rr")>; + +def SKLWriteResGroup93 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup93], (instregex "VPSLLDYrr")>; +def: InstRW<[SKLWriteResGroup93], (instregex "VPSLLQYrr")>; +def: InstRW<[SKLWriteResGroup93], (instregex "VPSLLWYrr")>; +def: InstRW<[SKLWriteResGroup93], (instregex "VPSRADYrr")>; +def: InstRW<[SKLWriteResGroup93], (instregex "VPSRAWYrr")>; +def: InstRW<[SKLWriteResGroup93], 
(instregex "VPSRLDYrr")>; +def: InstRW<[SKLWriteResGroup93], (instregex "VPSRLQYrr")>; +def: InstRW<[SKLWriteResGroup93], (instregex "VPSRLWYrr")>; + +def SKLWriteResGroup94 : SchedWriteRes<[SKLPort01,SKLPort23]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup94], (instregex "ADDPDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "ADDPSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "ADDSDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "ADDSSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "ADDSUBPDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "ADDSUBPSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "MULPDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "MULPSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "MULSDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "MULSSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "SUBPDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "SUBPSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "SUBSDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "SUBSSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VADDPDYrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VADDPDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VADDPSYrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VADDPSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VADDSDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VADDSSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VADDSUBPDYrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VADDSUBPDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VADDSUBPSYrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VADDSUBPSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD132PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD132PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD132PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD132PSm")>; 
+def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD132SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD132SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD213PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD213PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD213PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD213PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD213SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD213SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD231PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD231PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD231PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD231PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD231SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADD231SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB132PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB132PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB132PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB132PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB213PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB213PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB213PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB213PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB231PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB231PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB231PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMADDSUB231PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB132PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB132PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB132PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB132PSm")>; +def: 
InstRW<[SKLWriteResGroup94], (instregex "VFMSUB132SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB132SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB213PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB213PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB213PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB213PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB213SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB213SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB231PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB231PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB231PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB231PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB231SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUB231SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD132PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD132PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD132PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD132PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD213PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD213PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD213PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD213PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD231PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD231PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD231PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFMSUBADD231PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD132PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD132PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD132PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD132PSm")>; +def: 
InstRW<[SKLWriteResGroup94], (instregex "VFNMADD132SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD132SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD213PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD213PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD213PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD213PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD213SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD213SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD231PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD231PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD231PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD231PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD231SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMADD231SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB132PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB132PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB132PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB132PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB132SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB132SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB213PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB213PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB213PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB213PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB213SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB213SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB231PDYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB231PDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB231PSYm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB231PSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex 
"VFNMSUB231SDm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VFNMSUB231SSm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VMULPDYrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VMULPDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VMULPSYrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VMULPSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VMULSDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VMULSSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VSUBPDYrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VSUBPDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VSUBPSYrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VSUBPSrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VSUBSDrm")>; +def: InstRW<[SKLWriteResGroup94], (instregex "VSUBSSrm")>; + +def SKLWriteResGroup95 : SchedWriteRes<[SKLPort23,SKLPort05]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} + +def SKLWriteResGroup96 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup96], (instregex "CMPPDrmi")>; +def: InstRW<[SKLWriteResGroup96], (instregex "CMPPSrmi")>; +def: InstRW<[SKLWriteResGroup96], (instregex "CMPSSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "CVTDQ2PSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "CVTPS2DQrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "CVTPS2PDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "CVTSS2SDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "CVTTPS2DQrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "MAXPDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "MAXPSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "MAXSDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "MAXSSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "MINPDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "MINPSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "MINSDrm")>; 
+def: InstRW<[SKLWriteResGroup96], (instregex "MINSSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "MMX_CVTPS2PIirm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "MMX_CVTTPS2PIirm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "PHMINPOSUWrm128")>; +def: InstRW<[SKLWriteResGroup96], (instregex "PMADDUBSWrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "PMADDWDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "PMULDQrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "PMULHRSWrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "PMULHUWrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "PMULHWrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "PMULLWrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "PMULUDQrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCMPPDYrmi")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCMPPDrmi")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCMPPSYrmi")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCMPPSrmi")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCMPSDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCMPSSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCVTDQ2PSYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCVTDQ2PSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCVTPH2PSYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCVTPH2PSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCVTPS2DQYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCVTPS2DQrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCVTPS2PDYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCVTPS2PDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCVTSS2SDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCVTTPS2DQYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VCVTTPS2DQrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VMAXPDYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VMAXPDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VMAXPSYrm")>; 
+def: InstRW<[SKLWriteResGroup96], (instregex "VMAXPSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VMAXSDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VMAXSSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VMINPDYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VMINPDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VMINPSYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VMINPSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VMINSDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VMINSSrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPHMINPOSUWrm128")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMADDUBSWYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMADDUBSWrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMADDWDYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMADDWDrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULDQYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULDQrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULHRSWYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULHRSWrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULHUWYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULHUWrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULHWYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULHWrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULLWYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULLWrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULUDQYrm")>; +def: InstRW<[SKLWriteResGroup96], (instregex "VPMULUDQrm")>; + +def SKLWriteResGroup97 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup97], (instregex "FICOM16m")>; +def: InstRW<[SKLWriteResGroup97], (instregex "FICOM32m")>; +def: InstRW<[SKLWriteResGroup97], (instregex "FICOMP16m")>; +def: InstRW<[SKLWriteResGroup97], (instregex "FICOMP32m")>; +def: 
InstRW<[SKLWriteResGroup97], (instregex "MPSADBWrmi")>; +def: InstRW<[SKLWriteResGroup97], (instregex "VMPSADBWYrmi")>; +def: InstRW<[SKLWriteResGroup97], (instregex "VMPSADBWrmi")>; + +def SKLWriteResGroup98 : SchedWriteRes<[SKLPort1,SKLPort5,SKLPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup98], (instregex "MULX64rm")>; + +def SKLWriteResGroup100 : SchedWriteRes<[SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} +def: InstRW<[SKLWriteResGroup100], (instregex "FNCLEX")>; + +def SKLWriteResGroup101 : SchedWriteRes<[SKLPort6,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKLWriteResGroup101], (instregex "PAUSE")>; + +def SKLWriteResGroup102 : SchedWriteRes<[SKLPort015,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKLWriteResGroup102], (instregex "VZEROUPPER")>; + +def SKLWriteResGroup103 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKLWriteResGroup103], (instregex "LAR(16|32|64)rr")>; + +def SKLWriteResGroup105 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup105], (instregex "SHLD(16|32|64)mri8")>; +def: InstRW<[SKLWriteResGroup105], (instregex "SHRD(16|32|64)mri8")>; + +def SKLWriteResGroup106 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKLWriteResGroup106], (instregex "LAR(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup106], (instregex "LSL(16|32|64)rm")>; + +def SKLWriteResGroup107 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 6; + let ResourceCycles = [1,1,4]; +} +def: 
InstRW<[SKLWriteResGroup107], (instregex "PUSHF16")>; +def: InstRW<[SKLWriteResGroup107], (instregex "PUSHF64")>; + +def SKLWriteResGroup108 : SchedWriteRes<[SKLPort015]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} + +def SKLWriteResGroup109 : SchedWriteRes<[SKLPort0,SKLPort5]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup109], (instregex "CVTDQ2PDrr")>; +def: InstRW<[SKLWriteResGroup109], (instregex "MMX_CVTPI2PDirr")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VCVTDQ2PDrr")>; + +def SKLWriteResGroup110 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup110], (instregex "CVTPD2DQrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "CVTPD2PSrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "CVTPS2PDrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "CVTSD2SSrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "CVTSI2SD64rr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "CVTSI2SDrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "CVTSI2SSrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "CVTSS2SDrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "CVTTPD2DQrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "MMX_CVTPD2PIirr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "MMX_CVTPS2PIirr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "MMX_CVTTPD2PIirr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "MMX_CVTTPS2PIirr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VCVTPD2DQrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VCVTPD2PSrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VCVTPH2PSrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VCVTPS2PDrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VCVTPS2PHrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VCVTSD2SSrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex 
"VCVTSI2SD64rr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VCVTSI2SDrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VCVTSI2SSrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VCVTSS2SDrr")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VCVTTPD2DQrr")>; + +def SKLWriteResGroup112 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} + +def SKLWriteResGroup113 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup113], (instregex "CVTDQ2PDrm")>; +def: InstRW<[SKLWriteResGroup113], (instregex "MMX_CVTPI2PDirm")>; +def: InstRW<[SKLWriteResGroup113], (instregex "VCVTDQ2PDrm")>; + +def SKLWriteResGroup114 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup114], (instregex "STR(16|32|64)r")>; + +def SKLWriteResGroup116 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup116], (instregex "CVTPD2DQrm")>; +def: InstRW<[SKLWriteResGroup116], (instregex "CVTPD2PSrm")>; +def: InstRW<[SKLWriteResGroup116], (instregex "CVTSD2SSrm")>; +def: InstRW<[SKLWriteResGroup116], (instregex "CVTTPD2DQrm")>; +def: InstRW<[SKLWriteResGroup116], (instregex "MMX_CVTPD2PIirm")>; +def: InstRW<[SKLWriteResGroup116], (instregex "MMX_CVTTPD2PIirm")>; +def: InstRW<[SKLWriteResGroup116], (instregex "VCVTSD2SSrm")>; + +def SKLWriteResGroup118 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort06,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup118], (instregex "MULX32rm")>; + +def SKLWriteResGroup119 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort015]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: 
InstRW<[SKLWriteResGroup119], (instregex "VCVTPS2PHmr")>; + +def SKLWriteResGroup120 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: InstRW<[SKLWriteResGroup120], (instregex "XSETBV")>; + +def SKLWriteResGroup121 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[SKLWriteResGroup121], (instregex "CMPXCHG(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup121], (instregex "CMPXCHG8rr")>; + +def SKLWriteResGroup122 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,3]; +} +def: InstRW<[SKLWriteResGroup122], (instregex "ADD8mi")>; +def: InstRW<[SKLWriteResGroup122], (instregex "AND8mi")>; +def: InstRW<[SKLWriteResGroup122], (instregex "OR8mi")>; +def: InstRW<[SKLWriteResGroup122], (instregex "SUB8mi")>; +def: InstRW<[SKLWriteResGroup122], (instregex "XCHG(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "XCHG8rm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "XOR8mi")>; + +def SKLWriteResGroup123 : SchedWriteRes<[SKLPort5]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup123], (instregex "PCLMULQDQrr")>; +def: InstRW<[SKLWriteResGroup123], (instregex "VPCLMULQDQrr")>; + +def SKLWriteResGroup124 : SchedWriteRes<[SKLPort0]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup124], (instregex "MMX_CVTPI2PSirr")>; + +def SKLWriteResGroup125 : SchedWriteRes<[SKLPort0,SKLPort015]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup125], (instregex "CVTSD2SI64rr")>; +def: InstRW<[SKLWriteResGroup125], (instregex "CVTSD2SIrr")>; +def: InstRW<[SKLWriteResGroup125], (instregex "CVTSS2SI64rr")>; +def: InstRW<[SKLWriteResGroup125], (instregex 
"CVTSS2SIrr")>; +def: InstRW<[SKLWriteResGroup125], (instregex "CVTTSD2SI64rr")>; +def: InstRW<[SKLWriteResGroup125], (instregex "CVTTSD2SIrr")>; +def: InstRW<[SKLWriteResGroup125], (instregex "VCVTSD2SI64rr")>; +def: InstRW<[SKLWriteResGroup125], (instregex "VCVTSD2SIrr")>; +def: InstRW<[SKLWriteResGroup125], (instregex "VCVTSS2SI64rr")>; +def: InstRW<[SKLWriteResGroup125], (instregex "VCVTSS2SIrr")>; +def: InstRW<[SKLWriteResGroup125], (instregex "VCVTTSD2SI64rr")>; +def: InstRW<[SKLWriteResGroup125], (instregex "VCVTTSD2SIrr")>; + +def SKLWriteResGroup126 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup126], (instregex "PCLMULQDQrm")>; +def: InstRW<[SKLWriteResGroup126], (instregex "VPCLMULQDQrm")>; + +def SKLWriteResGroup127 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup127], (instregex "HADDPDrr")>; +def: InstRW<[SKLWriteResGroup127], (instregex "HADDPSrr")>; +def: InstRW<[SKLWriteResGroup127], (instregex "HSUBPDrr")>; +def: InstRW<[SKLWriteResGroup127], (instregex "HSUBPSrr")>; +def: InstRW<[SKLWriteResGroup127], (instregex "VHADDPDYrr")>; +def: InstRW<[SKLWriteResGroup127], (instregex "VHADDPDrr")>; +def: InstRW<[SKLWriteResGroup127], (instregex "VHADDPSYrr")>; +def: InstRW<[SKLWriteResGroup127], (instregex "VHADDPSrr")>; +def: InstRW<[SKLWriteResGroup127], (instregex "VHSUBPDYrr")>; +def: InstRW<[SKLWriteResGroup127], (instregex "VHSUBPDrr")>; +def: InstRW<[SKLWriteResGroup127], (instregex "VHSUBPSYrr")>; +def: InstRW<[SKLWriteResGroup127], (instregex "VHSUBPSrr")>; + +def SKLWriteResGroup128 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup128], (instregex "ADD_FI16m")>; +def: InstRW<[SKLWriteResGroup128], (instregex "ADD_FI32m")>; +def: InstRW<[SKLWriteResGroup128], 
(instregex "SUBR_FI16m")>; +def: InstRW<[SKLWriteResGroup128], (instregex "SUBR_FI32m")>; +def: InstRW<[SKLWriteResGroup128], (instregex "SUB_FI16m")>; +def: InstRW<[SKLWriteResGroup128], (instregex "SUB_FI32m")>; + +def SKLWriteResGroup129 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup129], (instregex "CVTSI2SS64rr")>; +def: InstRW<[SKLWriteResGroup129], (instregex "VCVTSI2SS64rr")>; + +def SKLWriteResGroup130 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort015]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup130], (instregex "CVTSD2SI64rm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "CVTSD2SIrm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "CVTSS2SI64rm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "CVTSS2SIrm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "CVTTSD2SI64rm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "CVTTSD2SIrm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "CVTTSS2SIrm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "VCVTSD2SI64rm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "VCVTSD2SIrm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "VCVTSS2SI64rm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "VCVTSS2SIrm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "VCVTTSD2SI64rm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "VCVTTSD2SIrm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "VCVTTSS2SI64rm")>; +def: InstRW<[SKLWriteResGroup130], (instregex "VCVTTSS2SIrm")>; + +def SKLWriteResGroup131 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SKLWriteResGroup131], (instregex "SHLD(16|32|64)rrCL")>; +def: InstRW<[SKLWriteResGroup131], (instregex "SHRD(16|32|64)rrCL")>; + +def SKLWriteResGroup133 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { + let 
Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup133], (instregex "HADDPDrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "HADDPSrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "HSUBPDrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "HSUBPSrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VHADDPDYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VHADDPDrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VHADDPSYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VHADDPSrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VHSUBPDYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VHSUBPDrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VHSUBPSYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VHSUBPSrm")>; + +def SKLWriteResGroup134 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup134], (instregex "SLDT(16|32|64)r")>; + +def SKLWriteResGroup136 : SchedWriteRes<[SKLPort6,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,5]; +} +def: InstRW<[SKLWriteResGroup136], (instregex "STD")>; + +def SKLWriteResGroup137 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,2,1]; +} +def: InstRW<[SKLWriteResGroup137], (instregex "SHLD(16|32|64)mrCL")>; +def: InstRW<[SKLWriteResGroup137], (instregex "SHRD(16|32|64)mrCL")>; + +def SKLWriteResGroup140 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 19; + let ResourceCycles = [1,8,8,2]; +} + +def SKLWriteResGroup141 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort237,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 36; + let ResourceCycles = [1,16,1,16,2]; +} + +def SKLWriteResGroup142 : SchedWriteRes<[SKLPort0,SKLPort5]> { + let 
Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup142], (instregex "VCVTDQ2PDYrr")>; + +def SKLWriteResGroup143 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup143], (instregex "VCVTPD2DQYrr")>; +def: InstRW<[SKLWriteResGroup143], (instregex "VCVTPD2PSYrr")>; +def: InstRW<[SKLWriteResGroup143], (instregex "VCVTPH2PSYrr")>; +def: InstRW<[SKLWriteResGroup143], (instregex "VCVTPS2PDYrr")>; +def: InstRW<[SKLWriteResGroup143], (instregex "VCVTPS2PHYrr")>; +def: InstRW<[SKLWriteResGroup143], (instregex "VCVTTPD2DQYrr")>; + +def SKLWriteResGroup145 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup145], (instregex "MUL_FI16m")>; +def: InstRW<[SKLWriteResGroup145], (instregex "MUL_FI32m")>; +def: InstRW<[SKLWriteResGroup145], (instregex "VCVTDQ2PDYrm")>; + +def SKLWriteResGroup146 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup146], (instregex "CVTTSS2SI64rr")>; +def: InstRW<[SKLWriteResGroup146], (instregex "CVTTSS2SIrr")>; +def: InstRW<[SKLWriteResGroup146], (instregex "VCVTTSS2SI64rr")>; +def: InstRW<[SKLWriteResGroup146], (instregex "VCVTTSS2SIrr")>; + +def SKLWriteResGroup147 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} + +def SKLWriteResGroup149 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup149], (instregex "CVTTSS2SI64rm")>; + +def SKLWriteResGroup150 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort015]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: 
InstRW<[SKLWriteResGroup150], (instregex "VCVTPS2PHYmr")>; + +def SKLWriteResGroup151 : SchedWriteRes<[SKLPort6,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 7; + let ResourceCycles = [1,3,1,2]; +} +def: InstRW<[SKLWriteResGroup151], (instregex "LOOP")>; + +def SKLWriteResGroup156 : SchedWriteRes<[SKLPort0]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup156], (instregex "AESIMCrr")>; +def: InstRW<[SKLWriteResGroup156], (instregex "VAESIMCrr")>; + +def SKLWriteResGroup157 : SchedWriteRes<[SKLPort015]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup157], (instregex "PMULLDrr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "ROUNDPDr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "ROUNDPSr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "ROUNDSDr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "ROUNDSSr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "VPMULLDYrr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "VPMULLDrr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "VROUNDPDr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "VROUNDPSr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "VROUNDSDr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "VROUNDSSr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "VROUNDYPDr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "VROUNDYPSr")>; + +def SKLWriteResGroup160 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup160], (instregex "AESIMCrm")>; +def: InstRW<[SKLWriteResGroup160], (instregex "VAESIMCrm")>; + +def SKLWriteResGroup161 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup161], (instregex "PMULLDrm")>; +def: InstRW<[SKLWriteResGroup161], (instregex 
"ROUNDPDm")>; +def: InstRW<[SKLWriteResGroup161], (instregex "ROUNDPSm")>; +def: InstRW<[SKLWriteResGroup161], (instregex "ROUNDSDm")>; +def: InstRW<[SKLWriteResGroup161], (instregex "ROUNDSSm")>; +def: InstRW<[SKLWriteResGroup161], (instregex "VPMULLDYrm")>; +def: InstRW<[SKLWriteResGroup161], (instregex "VPMULLDrm")>; +def: InstRW<[SKLWriteResGroup161], (instregex "VROUNDPDm")>; +def: InstRW<[SKLWriteResGroup161], (instregex "VROUNDPSm")>; +def: InstRW<[SKLWriteResGroup161], (instregex "VROUNDSDm")>; +def: InstRW<[SKLWriteResGroup161], (instregex "VROUNDSSm")>; +def: InstRW<[SKLWriteResGroup161], (instregex "VROUNDYPDm")>; +def: InstRW<[SKLWriteResGroup161], (instregex "VROUNDYPSm")>; + +def SKLWriteResGroup165 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup165], (instregex "DPPDrri")>; +def: InstRW<[SKLWriteResGroup165], (instregex "VDPPDrri")>; + +def SKLWriteResGroup167 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKLWriteResGroup167], (instregex "DPPDrmi")>; +def: InstRW<[SKLWriteResGroup167], (instregex "VDPPDrmi")>; + +def SKLWriteResGroup169 : SchedWriteRes<[SKLPort0]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKLWriteResGroup169], (instregex "PCMPISTRIrr")>; +def: InstRW<[SKLWriteResGroup169], (instregex "PCMPISTRM128rr")>; +def: InstRW<[SKLWriteResGroup169], (instregex "VPCMPISTRIrr")>; +def: InstRW<[SKLWriteResGroup169], (instregex "VPCMPISTRM128rr")>; + +def SKLWriteResGroup170 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} +def: InstRW<[SKLWriteResGroup170], (instregex "PCMPISTRIrm")>; +def: InstRW<[SKLWriteResGroup170], (instregex "PCMPISTRM128rm")>; +def: InstRW<[SKLWriteResGroup170], (instregex "VPCMPISTRIrm")>; +def: 
InstRW<[SKLWriteResGroup170], (instregex "VPCMPISTRM128rm")>; + +def SKLWriteResGroup171 : SchedWriteRes<[SKLPort05,SKLPort0156]> { + let Latency = 10; + let NumMicroOps = 10; + let ResourceCycles = [9,1]; +} +def: InstRW<[SKLWriteResGroup171], (instregex "MMX_EMMS")>; + +def SKLWriteResGroup172 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 10; + let NumMicroOps = 10; + let ResourceCycles = [1,1,1,5,1,1]; +} +def: InstRW<[SKLWriteResGroup172], (instregex "RCL(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup172], (instregex "RCL8mCL")>; + +def SKLWriteResGroup173 : SchedWriteRes<[SKLPort0]> { + let Latency = 11; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup173], (instregex "DIVPSrr")>; +def: InstRW<[SKLWriteResGroup173], (instregex "DIVSSrr")>; +def: InstRW<[SKLWriteResGroup173], (instregex "VDIVPSYrr")>; +def: InstRW<[SKLWriteResGroup173], (instregex "VDIVPSrr")>; +def: InstRW<[SKLWriteResGroup173], (instregex "VDIVSSrr")>; + +def SKLWriteResGroup174 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup174], (instregex "DIVPSrm")>; +def: InstRW<[SKLWriteResGroup174], (instregex "DIVSSrm")>; +def: InstRW<[SKLWriteResGroup174], (instregex "VDIVPSYrm")>; +def: InstRW<[SKLWriteResGroup174], (instregex "VDIVPSrm")>; +def: InstRW<[SKLWriteResGroup174], (instregex "VDIVSSrm")>; + +def SKLWriteResGroup175 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 11; + let NumMicroOps = 7; + let ResourceCycles = [2,3,2]; +} +def: InstRW<[SKLWriteResGroup175], (instregex "RCL(16|32|64)rCL")>; +def: InstRW<[SKLWriteResGroup175], (instregex "RCR(16|32|64)rCL")>; + +def SKLWriteResGroup176 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 11; + let NumMicroOps = 9; + let ResourceCycles = [1,5,1,2]; +} +def: InstRW<[SKLWriteResGroup176], (instregex "RCL8rCL")>; 
+ +def SKLWriteResGroup177 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 11; + let NumMicroOps = 11; + let ResourceCycles = [2,9]; +} +def: InstRW<[SKLWriteResGroup177], (instregex "LOOPE")>; +def: InstRW<[SKLWriteResGroup177], (instregex "LOOPNE")>; + +def SKLWriteResGroup178 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 11; + let NumMicroOps = 14; + let ResourceCycles = [1,1,1,4,2,5]; +} +def: InstRW<[SKLWriteResGroup178], (instregex "CMPXCHG8B")>; + +def SKLWriteResGroup179 : SchedWriteRes<[SKLPort0]> { + let Latency = 12; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup179], (instregex "SQRTPSr")>; +def: InstRW<[SKLWriteResGroup179], (instregex "SQRTSSr")>; +def: InstRW<[SKLWriteResGroup179], (instregex "VSQRTPSYr")>; +def: InstRW<[SKLWriteResGroup179], (instregex "VSQRTPSr")>; +def: InstRW<[SKLWriteResGroup179], (instregex "VSQRTSSr")>; + +def SKLWriteResGroup180 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup180], (instregex "SQRTPSm")>; +def: InstRW<[SKLWriteResGroup180], (instregex "SQRTSSm")>; +def: InstRW<[SKLWriteResGroup180], (instregex "VSQRTPSYm")>; +def: InstRW<[SKLWriteResGroup180], (instregex "VSQRTPSm")>; +def: InstRW<[SKLWriteResGroup180], (instregex "VSQRTSSm")>; + +def SKLWriteResGroup187 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKLWriteResGroup187], (instregex "DPPSrri")>; +def: InstRW<[SKLWriteResGroup187], (instregex "VDPPSYrri")>; +def: InstRW<[SKLWriteResGroup187], (instregex "VDPPSrri")>; + +def SKLWriteResGroup188 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 13; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKLWriteResGroup188], (instregex "DPPSrmi")>; +def: InstRW<[SKLWriteResGroup188], (instregex 
"VDPPSYrmi")>; +def: InstRW<[SKLWriteResGroup188], (instregex "VDPPSrmi")>; + +def SKLWriteResGroup189 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 13; + let NumMicroOps = 11; + let ResourceCycles = [2,1,1,4,1,2]; +} +def: InstRW<[SKLWriteResGroup189], (instregex "RCR(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup189], (instregex "RCR8mCL")>; + +def SKLWriteResGroup190 : SchedWriteRes<[SKLPort0]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup190], (instregex "DIVPDrr")>; +def: InstRW<[SKLWriteResGroup190], (instregex "DIVSDrr")>; +def: InstRW<[SKLWriteResGroup190], (instregex "VDIVPDYrr")>; +def: InstRW<[SKLWriteResGroup190], (instregex "VDIVPDrr")>; +def: InstRW<[SKLWriteResGroup190], (instregex "VDIVSDrr")>; + +def SKLWriteResGroup191 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 14; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup191], (instregex "DIVPDrm")>; +def: InstRW<[SKLWriteResGroup191], (instregex "DIVSDrm")>; +def: InstRW<[SKLWriteResGroup191], (instregex "VDIVPDYrm")>; +def: InstRW<[SKLWriteResGroup191], (instregex "VDIVPDrm")>; +def: InstRW<[SKLWriteResGroup191], (instregex "VDIVSDrm")>; + +def SKLWriteResGroup192 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [2,4,1,3]; +} +def: InstRW<[SKLWriteResGroup192], (instregex "RCR8rCL")>; + +def SKLWriteResGroup193 : SchedWriteRes<[SKLPort0]> { + let Latency = 15; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup193], (instregex "DIVR_FPrST0")>; +def: InstRW<[SKLWriteResGroup193], (instregex "DIVR_FST0r")>; +def: InstRW<[SKLWriteResGroup193], (instregex "DIVR_FrST0")>; + +def SKLWriteResGroup194 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 15; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: 
InstRW<[SKLWriteResGroup194], (instregex "DIV_F32m")>; +def: InstRW<[SKLWriteResGroup194], (instregex "DIV_F64m")>; + +def SKLWriteResGroup195 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 15; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup195], (instregex "INSB")>; +def: InstRW<[SKLWriteResGroup195], (instregex "INSL")>; +def: InstRW<[SKLWriteResGroup195], (instregex "INSW")>; + +def SKLWriteResGroup196 : SchedWriteRes<[SKLPort0156]> { + let Latency = 16; + let NumMicroOps = 16; + let ResourceCycles = [16]; +} +def: InstRW<[SKLWriteResGroup196], (instregex "VZEROALL")>; + +def SKLWriteResGroup197 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> { + let Latency = 17; + let NumMicroOps = 15; + let ResourceCycles = [2,1,2,4,2,4]; +} +def: InstRW<[SKLWriteResGroup197], (instregex "XCH_F")>; + +def SKLWriteResGroup198 : SchedWriteRes<[SKLPort0]> { + let Latency = 18; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup198], (instregex "SQRTPDr")>; +def: InstRW<[SKLWriteResGroup198], (instregex "SQRTSDr")>; +def: InstRW<[SKLWriteResGroup198], (instregex "VSQRTPDYr")>; +def: InstRW<[SKLWriteResGroup198], (instregex "VSQRTPDr")>; +def: InstRW<[SKLWriteResGroup198], (instregex "VSQRTSDr")>; + +def SKLWriteResGroup199 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 18; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup199], (instregex "SQRTPDm")>; +def: InstRW<[SKLWriteResGroup199], (instregex "SQRTSDm")>; +def: InstRW<[SKLWriteResGroup199], (instregex "VSQRTPDYm")>; +def: InstRW<[SKLWriteResGroup199], (instregex "VSQRTPDm")>; +def: InstRW<[SKLWriteResGroup199], (instregex "VSQRTSDm")>; + +def SKLWriteResGroup200 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 18; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: 
InstRW<[SKLWriteResGroup200], (instregex "DIV_FI16m")>; +def: InstRW<[SKLWriteResGroup200], (instregex "DIV_FI32m")>; + +def SKLWriteResGroup201 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [4,3,1]; +} +def: InstRW<[SKLWriteResGroup201], (instregex "PCMPESTRIrr")>; +def: InstRW<[SKLWriteResGroup201], (instregex "VPCMPESTRIrr")>; + +def SKLWriteResGroup202 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort06,SKLPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,5]; +} +def: InstRW<[SKLWriteResGroup202], (instregex "CPUID")>; +def: InstRW<[SKLWriteResGroup202], (instregex "RDTSC")>; + +def SKLWriteResGroup203 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort0156]> { + let Latency = 18; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def: InstRW<[SKLWriteResGroup203], (instregex "PCMPESTRIrm")>; +def: InstRW<[SKLWriteResGroup203], (instregex "VPCMPESTRIrm")>; + +def SKLWriteResGroup204 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 18; + let NumMicroOps = 19; + let ResourceCycles = [2,1,4,1,1,4,6]; +} +def: InstRW<[SKLWriteResGroup204], (instregex "CMPXCHG16B")>; + +def SKLWriteResGroup205 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015,SKLPort0156]> { + let Latency = 19; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def: InstRW<[SKLWriteResGroup205], (instregex "PCMPESTRM128rr")>; +def: InstRW<[SKLWriteResGroup205], (instregex "VPCMPESTRM128rr")>; + +def SKLWriteResGroup206 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015,SKLPort0156]> { + let Latency = 19; + let NumMicroOps = 10; + let ResourceCycles = [4,3,1,1,1]; +} +def: InstRW<[SKLWriteResGroup206], (instregex "PCMPESTRM128rm")>; +def: InstRW<[SKLWriteResGroup206], (instregex "VPCMPESTRM128rm")>; + +def SKLWriteResGroup207 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 19; + let NumMicroOps = 
11; + let ResourceCycles = [3,6,1,1]; +} +def: InstRW<[SKLWriteResGroup207], (instregex "AESKEYGENASSIST128rm")>; +def: InstRW<[SKLWriteResGroup207], (instregex "VAESKEYGENASSIST128rm")>; + +def SKLWriteResGroup208 : SchedWriteRes<[SKLPort0]> { + let Latency = 20; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup208], (instregex "DIV_FPrST0")>; +def: InstRW<[SKLWriteResGroup208], (instregex "DIV_FST0r")>; +def: InstRW<[SKLWriteResGroup208], (instregex "DIV_FrST0")>; + +def SKLWriteResGroup209 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 20; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup209], (instregex "DIVR_F32m")>; +def: InstRW<[SKLWriteResGroup209], (instregex "DIVR_F64m")>; + +def SKLWriteResGroup210 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort0156]> { + let Latency = 20; + let NumMicroOps = 10; + let ResourceCycles = [1,2,7]; +} +def: InstRW<[SKLWriteResGroup210], (instregex "MWAITrr")>; + +def SKLWriteResGroup211 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015]> { + let Latency = 20; + let NumMicroOps = 11; + let ResourceCycles = [3,6,2]; +} +def: InstRW<[SKLWriteResGroup211], (instregex "AESKEYGENASSIST128rr")>; +def: InstRW<[SKLWriteResGroup211], (instregex "VAESKEYGENASSIST128rr")>; + +def SKLWriteResGroup215 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 23; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup215], (instregex "DIVR_FI16m")>; +def: InstRW<[SKLWriteResGroup215], (instregex "DIVR_FI32m")>; + +def SKLWriteResGroup217 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort0156]> { + let Latency = 23; + let NumMicroOps = 8; + let ResourceCycles = [2,4,1,1]; +} +def: InstRW<[SKLWriteResGroup217], (instregex "IDIV(16|32|64)m")>; +def: InstRW<[SKLWriteResGroup217], (instregex "IDIV8m")>; + +def SKLWriteResGroup222 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort06,SKLPort0156]> { + let Latency = 30; + let 
NumMicroOps = 23; + let ResourceCycles = [1,5,3,4,10]; +} +def: InstRW<[SKLWriteResGroup222], (instregex "IN32ri")>; +def: InstRW<[SKLWriteResGroup222], (instregex "IN32rr")>; +def: InstRW<[SKLWriteResGroup222], (instregex "IN8ri")>; +def: InstRW<[SKLWriteResGroup222], (instregex "IN8rr")>; + +def SKLWriteResGroup223 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 30; + let NumMicroOps = 23; + let ResourceCycles = [1,5,2,1,4,10]; +} +def: InstRW<[SKLWriteResGroup223], (instregex "OUT32ir")>; +def: InstRW<[SKLWriteResGroup223], (instregex "OUT32rr")>; +def: InstRW<[SKLWriteResGroup223], (instregex "OUT8ir")>; +def: InstRW<[SKLWriteResGroup223], (instregex "OUT8rr")>; + +def SKLWriteResGroup224 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> { + let Latency = 32; + let NumMicroOps = 31; + let ResourceCycles = [1,8,1,21]; +} +def: InstRW<[SKLWriteResGroup224], (instregex "XRSTOR(64?)")>; + +def SKLWriteResGroup225 : SchedWriteRes<[SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort15,SKLPort0156]> { + let Latency = 35; + let NumMicroOps = 18; + let ResourceCycles = [1,1,2,3,1,1,1,8]; +} +def: InstRW<[SKLWriteResGroup225], (instregex "VMCLEARm")>; + +def SKLWriteResGroup226 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 36; + let NumMicroOps = 39; + let ResourceCycles = [1,10,1,1,26]; +} +def: InstRW<[SKLWriteResGroup226], (instregex "XSAVE64")>; + +def SKLWriteResGroup231 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 37; + let NumMicroOps = 40; + let ResourceCycles = [1,11,1,1,26]; +} +def: InstRW<[SKLWriteResGroup231], (instregex "XSAVE")>; + +def SKLWriteResGroup232 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 41; + let NumMicroOps = 44; + let ResourceCycles = [1,11,1,1,30]; +} +def: InstRW<[SKLWriteResGroup232], (instregex "XSAVEOPT")>; + +def SKLWriteResGroup233 : 
SchedWriteRes<[SKLPort5,SKLPort0156]> { + let Latency = 42; + let NumMicroOps = 22; + let ResourceCycles = [2,20]; +} +def: InstRW<[SKLWriteResGroup233], (instregex "RDTSCP")>; + +def SKLWriteResGroup234 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05,SKLPort06,SKLPort0156]> { + let Latency = 57; + let NumMicroOps = 64; + let ResourceCycles = [2,8,5,10,39]; +} +def: InstRW<[SKLWriteResGroup234], (instregex "FLDENVm")>; + +def SKLWriteResGroup235 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 58; + let NumMicroOps = 88; + let ResourceCycles = [4,4,31,1,2,1,45]; +} +def: InstRW<[SKLWriteResGroup235], (instregex "FXRSTOR64")>; + +def SKLWriteResGroup236 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 58; + let NumMicroOps = 90; + let ResourceCycles = [4,2,33,1,2,1,47]; +} +def: InstRW<[SKLWriteResGroup236], (instregex "FXRSTOR")>; + +def SKLWriteResGroup239 : SchedWriteRes<[SKLPort5,SKLPort05,SKLPort0156]> { + let Latency = 75; + let NumMicroOps = 15; + let ResourceCycles = [6,3,6]; +} +def: InstRW<[SKLWriteResGroup239], (instregex "FNINIT")>; + +def SKLWriteResGroup240 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> { + let Latency = 76; + let NumMicroOps = 32; + let ResourceCycles = [7,2,8,3,1,11]; +} +def: InstRW<[SKLWriteResGroup240], (instregex "DIV(16|32|64)r")>; + +def SKLWriteResGroup241 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort06,SKLPort0156]> { + let Latency = 102; + let NumMicroOps = 66; + let ResourceCycles = [4,2,4,8,14,34]; +} +def: InstRW<[SKLWriteResGroup241], (instregex "IDIV(16|32|64)r")>; + +def SKLWriteResGroup242 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 105; + let NumMicroOps = 100; + let ResourceCycles = [9,1,11,16,1,11,21,30]; +} +def: 
InstRW<[SKLWriteResGroup242], (instregex "FSTENVm")>; + +} // SchedModel Index: lib/Target/X86/X86Schedule.td =================================================================== --- lib/Target/X86/X86Schedule.td +++ lib/Target/X86/X86Schedule.td @@ -666,4 +666,4 @@ include "X86ScheduleSLM.td" include "X86ScheduleZnver1.td" include "X86ScheduleBtVer2.td" - +include "X86SchedSkylakeClient.td" Index: test/CodeGen/X86/aes-schedule.ll =================================================================== --- test/CodeGen/X86/aes-schedule.ll +++ test/CodeGen/X86/aes-schedule.ll @@ -36,8 +36,8 @@ ; ; SKYLAKE-LABEL: test_aesdec: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaesdec %xmm1, %xmm0, %xmm0 # sched: [7:1.00] -; SKYLAKE-NEXT: vaesdec (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SKYLAKE-NEXT: vaesdec %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vaesdec (%rdi), %xmm0, %xmm0 # sched: [4:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_aesdec: @@ -85,8 +85,8 @@ ; ; SKYLAKE-LABEL: test_aesdeclast: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00] -; SKYLAKE-NEXT: vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SKYLAKE-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [4:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_aesdeclast: @@ -134,8 +134,8 @@ ; ; SKYLAKE-LABEL: test_aesenc: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaesenc %xmm1, %xmm0, %xmm0 # sched: [7:1.00] -; SKYLAKE-NEXT: vaesenc (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SKYLAKE-NEXT: vaesenc %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vaesenc (%rdi), %xmm0, %xmm0 # sched: [4:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_aesenc: @@ -183,8 +183,8 @@ ; ; SKYLAKE-LABEL: test_aesenclast: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00] -; SKYLAKE-NEXT: 
vaesenclast (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SKYLAKE-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vaesenclast (%rdi), %xmm0, %xmm0 # sched: [4:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_aesenclast: @@ -236,8 +236,8 @@ ; ; SKYLAKE-LABEL: test_aesimc: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaesimc %xmm0, %xmm0 # sched: [14:2.00] -; SKYLAKE-NEXT: vaesimc (%rdi), %xmm1 # sched: [14:2.00] +; SKYLAKE-NEXT: vaesimc %xmm0, %xmm0 # sched: [8:2.00] +; SKYLAKE-NEXT: vaesimc (%rdi), %xmm1 # sched: [8:2.00] ; SKYLAKE-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -293,8 +293,8 @@ ; ; SKYLAKE-LABEL: test_aeskeygenassist: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 # sched: [29:7.00] -; SKYLAKE-NEXT: vaeskeygenassist $7, (%rdi), %xmm1 # sched: [28:7.00] +; SKYLAKE-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 # sched: [20:6.00] +; SKYLAKE-NEXT: vaeskeygenassist $7, (%rdi), %xmm1 # sched: [19:6.00] ; SKYLAKE-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; Index: test/CodeGen/X86/avx-schedule.ll =================================================================== --- test/CodeGen/X86/avx-schedule.ll +++ test/CodeGen/X86/avx-schedule.ll @@ -28,8 +28,8 @@ ; ; SKYLAKE-LABEL: test_addpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addpd: @@ -70,8 +70,8 @@ ; ; SKYLAKE-LABEL: test_addps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: 
[4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addps: @@ -112,8 +112,8 @@ ; ; SKYLAKE-LABEL: test_addsubpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addsubpd: @@ -155,8 +155,8 @@ ; ; SKYLAKE-LABEL: test_addsubps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addsubps: @@ -201,9 +201,9 @@ ; ; SKYLAKE-LABEL: test_andnotpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andnotpd: @@ -256,9 +256,9 @@ ; ; SKYLAKE-LABEL: test_andnotps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andnotps: @@ -311,9 +311,9 @@ ; ; SKYLAKE-LABEL: test_andpd: ; SKYLAKE: 
# BB#0: -; SKYLAKE-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andpd: @@ -364,9 +364,9 @@ ; ; SKYLAKE-LABEL: test_andps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andps: @@ -418,7 +418,7 @@ ; SKYLAKE-LABEL: test_blendpd: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33] -; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -505,8 +505,8 @@ ; ; SKYLAKE-LABEL: test_blendvpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00] -; SKYLAKE-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:2.00] +; SKYLAKE-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67] +; SKYLAKE-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:0.67] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendvpd: @@ -548,8 +548,8 @@ ; ; SKYLAKE-LABEL: test_blendvps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00] -; SKYLAKE-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 
# sched: [2:2.00] +; SKYLAKE-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67] +; SKYLAKE-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:0.67] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendvps: @@ -737,9 +737,9 @@ ; ; SKYLAKE-LABEL: test_cmppd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; SKYLAKE-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [4:0.33] +; SKYLAKE-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cmppd: @@ -789,9 +789,9 @@ ; ; SKYLAKE-LABEL: test_cmpps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; SKYLAKE-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [4:0.33] +; SKYLAKE-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cmpps: @@ -841,9 +841,9 @@ ; ; SKYLAKE-LABEL: test_cvtdq2pd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [6:1.00] -; SKYLAKE-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [6:1.00] -; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00] +; SKYLAKE-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [7:1.00] +; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtdq2pd: @@ -892,9 +892,9 @@ ; ; SKYLAKE-LABEL: test_cvtdq2ps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddps %ymm1, 
%ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtdq2ps: @@ -941,7 +941,7 @@ ; ; SKYLAKE-LABEL: test_cvtpd2dq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00] +; SKYLAKE-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [7:1.00] ; SKYLAKE-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00] ; SKYLAKE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] @@ -990,7 +990,7 @@ ; ; SKYLAKE-LABEL: test_cvtpd2ps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [6:1.00] +; SKYLAKE-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [7:1.00] ; SKYLAKE-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00] ; SKYLAKE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] @@ -1039,9 +1039,9 @@ ; ; SKYLAKE-LABEL: test_cvtps2dq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [3:1.00] -; SKYLAKE-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [4:0.50] +; SKYLAKE-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtps2dq: @@ -1085,8 +1085,8 @@ ; ; SKYLAKE-LABEL: test_divpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [35:2.00] -; SKYLAKE-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [35:2.00] +; SKYLAKE-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [14:1.00] +; SKYLAKE-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [14:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divpd: @@ -1127,8 +1127,8 @@ ; ; SKYLAKE-LABEL: test_divps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vdivps 
%ymm1, %ymm0, %ymm0 # sched: [21:2.00] -; SKYLAKE-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [21:2.00] +; SKYLAKE-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [11:1.00] +; SKYLAKE-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divps: @@ -1169,8 +1169,8 @@ ; ; SKYLAKE-LABEL: test_dpps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [14:2.00] -; SKYLAKE-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [14:2.00] +; SKYLAKE-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [13:1.33] +; SKYLAKE-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [13:1.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_dpps: @@ -1259,8 +1259,8 @@ ; ; SKYLAKE-LABEL: test_haddpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00] -; SKYLAKE-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [5:2.00] +; SKYLAKE-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [6:2.00] +; SKYLAKE-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [6:2.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_haddpd: @@ -1302,8 +1302,8 @@ ; ; SKYLAKE-LABEL: test_haddps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00] -; SKYLAKE-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [5:2.00] +; SKYLAKE-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [6:2.00] +; SKYLAKE-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [6:2.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_haddps: @@ -1345,8 +1345,8 @@ ; ; SKYLAKE-LABEL: test_hsubpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00] -; SKYLAKE-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [5:2.00] +; SKYLAKE-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [6:2.00] +; SKYLAKE-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [6:2.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_hsubpd: @@ -1388,8 +1388,8 @@ ; ; SKYLAKE-LABEL: test_hsubps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vhsubps %ymm1, 
%ymm0, %ymm0 # sched: [5:2.00] -; SKYLAKE-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [5:2.00] +; SKYLAKE-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [6:2.00] +; SKYLAKE-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [6:2.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_hsubps: @@ -1436,7 +1436,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00] ; SKYLAKE-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_insertf128: @@ -1520,8 +1520,8 @@ ; ; SKYLAKE-LABEL: test_maskmovpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [2:2.00] -; SKYLAKE-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [4:1.00] +; SKYLAKE-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [1:0.50] +; SKYLAKE-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1569,8 +1569,8 @@ ; ; SKYLAKE-LABEL: test_maskmovpd_ymm: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [2:2.00] -; SKYLAKE-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [4:1.00] +; SKYLAKE-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [1:0.50] +; SKYLAKE-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1618,8 +1618,8 @@ ; ; SKYLAKE-LABEL: test_maskmovps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [2:2.00] -; SKYLAKE-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [4:1.00] +; SKYLAKE-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [1:0.50] +; SKYLAKE-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ 
-1667,8 +1667,8 @@ ; ; SKYLAKE-LABEL: test_maskmovps_ymm: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [2:2.00] -; SKYLAKE-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [4:1.00] +; SKYLAKE-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [1:0.50] +; SKYLAKE-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1713,8 +1713,8 @@ ; ; SKYLAKE-LABEL: test_maxpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxpd: @@ -1756,8 +1756,8 @@ ; ; SKYLAKE-LABEL: test_maxps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxps: @@ -1799,8 +1799,8 @@ ; ; SKYLAKE-LABEL: test_minpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_minpd: @@ -1842,8 +1842,8 @@ ; ; SKYLAKE-LABEL: test_minps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: 
test_minps: @@ -1889,7 +1889,7 @@ ; SKYLAKE-LABEL: test_movapd: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovapd (%rdi), %ymm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1937,7 +1937,7 @@ ; SKYLAKE-LABEL: test_movaps: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovaps (%rdi), %ymm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1986,7 +1986,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00] ; SKYLAKE-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [1:0.50] -; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movddup: @@ -2030,7 +2030,7 @@ ; ; SKYLAKE-LABEL: test_movmskpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmovmskpd %ymm0, %eax # sched: [3:1.00] +; SKYLAKE-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00] ; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2070,7 +2070,7 @@ ; ; SKYLAKE-LABEL: test_movmskps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmovmskps %ymm0, %eax # sched: [3:1.00] +; SKYLAKE-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00] ; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2110,7 +2110,7 @@ ; ; SKYLAKE-LABEL: test_movntpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2151,7 +2151,7 @@ ; ; SKYLAKE-LABEL: test_movntps: ; SKYLAKE: # 
BB#0: -; SKYLAKE-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2197,7 +2197,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00] ; SKYLAKE-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [1:0.50] -; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movshdup: @@ -2246,7 +2246,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00] ; SKYLAKE-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [1:0.50] -; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movsldup: @@ -2296,7 +2296,7 @@ ; SKYLAKE-LABEL: test_movupd: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovupd (%rdi), %ymm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2346,7 +2346,7 @@ ; SKYLAKE-LABEL: test_movups: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovups (%rdi), %ymm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2390,8 +2390,8 @@ ; ; SKYLAKE-LABEL: test_mulpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:0.50] -; SKYLAKE-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SKYLAKE-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vmulpd 
(%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulpd: @@ -2432,8 +2432,8 @@ ; ; SKYLAKE-LABEL: test_mulps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50] -; SKYLAKE-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SKYLAKE-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulps: @@ -2477,9 +2477,9 @@ ; ; SKYLAKE-LABEL: orpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: orpd: @@ -2530,9 +2530,9 @@ ; ; SKYLAKE-LABEL: test_orps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_orps: @@ -2585,7 +2585,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00] ; SKYLAKE-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_permilpd: @@ -2634,7 +2634,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00] ; 
SKYLAKE-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_permilpd_ymm: @@ -2683,7 +2683,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00] ; SKYLAKE-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_permilps: @@ -2732,7 +2732,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00] ; SKYLAKE-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [1:1.00] -; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_permilps_ymm: @@ -2951,9 +2951,9 @@ ; ; SKYLAKE-LABEL: test_rcpps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:2.00] -; SKYLAKE-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] -; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vrcpps (%rdi), %ymm1 # sched: [4:1.00] +; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rcpps: @@ -3001,9 +3001,9 @@ ; ; SKYLAKE-LABEL: test_roundpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [5:1.25] -; SKYLAKE-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [6:2.00] -; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [8:0.67] +; SKYLAKE-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:0.67] +; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: 
retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundpd: @@ -3051,9 +3051,9 @@ ; ; SKYLAKE-LABEL: test_roundps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [5:1.25] -; SKYLAKE-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [6:2.00] -; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [8:0.67] +; SKYLAKE-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:0.67] +; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundps: @@ -3101,9 +3101,9 @@ ; ; SKYLAKE-LABEL: test_rsqrtps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:2.00] -; SKYLAKE-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [11:2.00] -; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [4:1.00] +; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rsqrtps: @@ -3153,7 +3153,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00] ; SKYLAKE-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_shufpd: @@ -3242,9 +3242,9 @@ ; ; SKYLAKE-LABEL: test_sqrtpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [35:2.00] -; SKYLAKE-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [35:2.00] -; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [18:1.00] +; SKYLAKE-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [18:1.00] +; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtpd: @@ -3292,9 +3292,9 @@ 
; ; SKYLAKE-LABEL: test_sqrtps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsqrtps (%rdi), %ymm1 # sched: [21:2.00] -; SKYLAKE-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:2.00] -; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsqrtps %ymm0, %ymm0 # sched: [12:1.00] +; SKYLAKE-NEXT: vsqrtps (%rdi), %ymm1 # sched: [12:1.00] +; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtps: @@ -3339,8 +3339,8 @@ ; ; SKYLAKE-LABEL: test_subpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subpd: @@ -3381,8 +3381,8 @@ ; ; SKYLAKE-LABEL: test_subps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subps: @@ -3433,10 +3433,10 @@ ; SKYLAKE-LABEL: test_testpd: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: xorl %eax, %eax # sched: [1:0.25] -; SKYLAKE-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vtestpd %xmm1, %xmm0 # sched: [2:1.00] ; SKYLAKE-NEXT: setb %al # sched: [1:0.50] -; SKYLAKE-NEXT: vtestpd (%rdi), %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: adcl $0, %eax # sched: [2:0.50] +; SKYLAKE-NEXT: vtestpd (%rdi), %xmm0 # sched: [2:1.00] +; SKYLAKE-NEXT: adcl $0, %eax # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_testpd: @@ -3498,10 +3498,10 @@ ; SKYLAKE-LABEL: test_testpd_ymm: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: xorl %eax, %eax # sched: [1:0.25] -; SKYLAKE-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00] 
+; SKYLAKE-NEXT: vtestpd %ymm1, %ymm0 # sched: [2:1.00] ; SKYLAKE-NEXT: setb %al # sched: [1:0.50] -; SKYLAKE-NEXT: vtestpd (%rdi), %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: adcl $0, %eax # sched: [2:0.50] +; SKYLAKE-NEXT: vtestpd (%rdi), %ymm0 # sched: [2:1.00] +; SKYLAKE-NEXT: adcl $0, %eax # sched: [1:0.50] ; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -3562,10 +3562,10 @@ ; SKYLAKE-LABEL: test_testps: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: xorl %eax, %eax # sched: [1:0.25] -; SKYLAKE-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vtestps %xmm1, %xmm0 # sched: [2:1.00] ; SKYLAKE-NEXT: setb %al # sched: [1:0.50] -; SKYLAKE-NEXT: vtestps (%rdi), %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: adcl $0, %eax # sched: [2:0.50] +; SKYLAKE-NEXT: vtestps (%rdi), %xmm0 # sched: [2:1.00] +; SKYLAKE-NEXT: adcl $0, %eax # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_testps: @@ -3627,10 +3627,10 @@ ; SKYLAKE-LABEL: test_testps_ymm: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: xorl %eax, %eax # sched: [1:0.25] -; SKYLAKE-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vtestps %ymm1, %ymm0 # sched: [2:1.00] ; SKYLAKE-NEXT: setb %al # sched: [1:0.50] -; SKYLAKE-NEXT: vtestps (%rdi), %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: adcl $0, %eax # sched: [2:0.50] +; SKYLAKE-NEXT: vtestps (%rdi), %ymm0 # sched: [2:1.00] +; SKYLAKE-NEXT: adcl $0, %eax # sched: [1:0.50] ; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -3686,7 +3686,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] ; SKYLAKE-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpckhpd: @@ -3777,7 +3777,7 @@ ; SKYLAKE: # BB#0: ; 
SKYLAKE-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] ; SKYLAKE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpcklpd: @@ -3866,9 +3866,9 @@ ; ; SKYLAKE-LABEL: test_xorpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_xorpd: @@ -3919,9 +3919,9 @@ ; ; SKYLAKE-LABEL: test_xorps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_xorps: @@ -3966,7 +3966,7 @@ ; ; SKYLAKE-LABEL: test_zeroall: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vzeroall # sched: [16:16.00] +; SKYLAKE-NEXT: vzeroall # sched: [16:4.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_zeroall: Index: test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-x86.ll +++ test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -1619,10 +1619,10 @@ ; ; AVX512VL-LABEL: test_gather_mask: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: 
[0x8b,0x44,0x24,0x04] -; AVX512VL-NEXT: vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda] -; AVX512VL-NEXT: vgatherdps %ymm3, (%eax,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x88] ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] +; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] +; AVX512VL-NEXT: vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda] +; AVX512VL-NEXT: vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x89] ; AVX512VL-NEXT: vmovups %ymm2, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x10] ; AVX512VL-NEXT: retl ## encoding: [0xc3] %a_i8 = bitcast float* %a to i8* Index: test/CodeGen/X86/avx2-schedule.ll =================================================================== --- test/CodeGen/X86/avx2-schedule.ll +++ test/CodeGen/X86/avx2-schedule.ll @@ -127,7 +127,7 @@ ; ; SKYLAKE-LABEL: test_paddb: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -157,7 +157,7 @@ ; ; SKYLAKE-LABEL: test_paddd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -187,7 +187,7 @@ ; ; SKYLAKE-LABEL: test_paddq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -217,7 +217,7 @@ ; ; SKYLAKE-LABEL: test_paddw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddw %ymm1, 
%ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -251,7 +251,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pand: @@ -286,7 +286,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pandn: @@ -319,8 +319,8 @@ ; ; SKYLAKE-LABEL: test_pmulld: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00] -; SKYLAKE-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [10:2.00] +; SKYLAKE-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [8:0.67] +; SKYLAKE-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [8:0.67] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pmulld: @@ -349,8 +349,8 @@ ; ; SKYLAKE-LABEL: test_pmullw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pmullw: @@ -383,7 +383,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_por: @@ -414,7 +414,7 
@@ ; ; SKYLAKE-LABEL: test_psubb: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -444,7 +444,7 @@ ; ; SKYLAKE-LABEL: test_psubd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -474,7 +474,7 @@ ; ; SKYLAKE-LABEL: test_psubq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -504,7 +504,7 @@ ; ; SKYLAKE-LABEL: test_psubw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -538,7 +538,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pxor: Index: test/CodeGen/X86/avx512-cmp.ll =================================================================== --- test/CodeGen/X86/avx512-cmp.ll +++ test/CodeGen/X86/avx512-cmp.ll @@ -124,16 +124,27 @@ } define i32 @test8(i32 %a1, i32 %a2, i32 %a3) { -; ALL-LABEL: test8: -; ALL: ## BB#0: -; ALL-NEXT: xorl $-2147483648, %esi ## imm = 0x80000000 -; ALL-NEXT: testl %edx, %edx -; ALL-NEXT: movl $1, %eax -; ALL-NEXT: cmovel %eax, %edx -; ALL-NEXT: notl %edi -; ALL-NEXT: orl %edi, 
%esi -; ALL-NEXT: cmovnel %edx, %eax -; ALL-NEXT: retq +; KNL-LABEL: test8: +; KNL: ## BB#0: +; KNL-NEXT: xorl $-2147483648, %esi ## imm = 0x80000000 +; KNL-NEXT: testl %edx, %edx +; KNL-NEXT: movl $1, %eax +; KNL-NEXT: cmovel %eax, %edx +; KNL-NEXT: notl %edi +; KNL-NEXT: orl %edi, %esi +; KNL-NEXT: cmovnel %edx, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test8: +; SKX: ## BB#0: +; SKX-NEXT: notl %edi +; SKX-NEXT: xorl $-2147483648, %esi ## imm = 0x80000000 +; SKX-NEXT: testl %edx, %edx +; SKX-NEXT: movl $1, %eax +; SKX-NEXT: cmovel %eax, %edx +; SKX-NEXT: orl %edi, %esi +; SKX-NEXT: cmovnel %edx, %eax +; SKX-NEXT: retq %tmp1 = icmp eq i32 %a1, -1 %tmp2 = icmp eq i32 %a2, -2147483648 %tmp3 = and i1 %tmp1, %tmp2 Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -936,6 +936,7 @@ ; KNL-NEXT: subq $32, %rsp ; KNL-NEXT: xorl %eax, %eax ; KNL-NEXT: cmpl %esi, %edi +; KNL-NEXT: setb %al ; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 @@ -1061,7 +1062,6 @@ ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; KNL-NEXT: setb %al ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 @@ -1073,9 +1073,9 @@ ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 ; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ 
test/CodeGen/X86/avx512-mask-op.ll @@ -945,11 +945,11 @@ ; KNL-NEXT: kmovw (%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: movl $1, %eax ; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 @@ -1061,13 +1061,13 @@ ; KNL-NEXT: kmovw (%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: xorl %eax, %eax ; KNL-NEXT: cmpl %edx, %esi ; KNL-NEXT: setg %al ; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 @@ -1836,201 +1836,201 @@ ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $32, %rsp ; KNL-NEXT: vmovups 64(%rdi), %zmm2 -; KNL-NEXT: vcmpltps %zmm1, %zmm2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k0 +; KNL-NEXT: vcmpltps %zmm1, %zmm2, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: kshiftlw $15, %k2, %k0 +; KNL-NEXT: kshiftlw $15, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: vmovd %ecx, %xmm2 ; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $13, %k2, %k0 +; KNL-NEXT: kshiftlw $13, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $12, %k2, %k0 +; KNL-NEXT: kshiftlw $12, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 
-; KNL-NEXT: kshiftlw $11, %k2, %k0 +; KNL-NEXT: kshiftlw $11, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $10, %k2, %k0 +; KNL-NEXT: kshiftlw $10, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $9, %k2, %k0 +; KNL-NEXT: kshiftlw $9, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $8, %k2, %k0 +; KNL-NEXT: kshiftlw $8, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $7, %k2, %k0 +; KNL-NEXT: kshiftlw $7, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $6, %k2, %k0 +; KNL-NEXT: kshiftlw $6, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $5, %k2, %k0 +; KNL-NEXT: kshiftlw $5, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $4, %k2, %k0 +; KNL-NEXT: kshiftlw $4, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $3, %k2, %k0 +; KNL-NEXT: kshiftlw $3, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $2, %k2, %k0 +; KNL-NEXT: kshiftlw $2, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftlw $1, %k2, %k0 +; KNL-NEXT: kshiftlw $1, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; KNL-NEXT: kshiftrw $15, %k2, %k0 +; KNL-NEXT: 
vmovups (%rdi), %zmm3 +; KNL-NEXT: kshiftrw $15, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; KNL-NEXT: vmovups (%rdi), %zmm3 -; KNL-NEXT: vcmpltps %zmm0, %zmm3, %k1 -; KNL-NEXT: kshiftlw $14, %k1, %k0 +; KNL-NEXT: vcmpltps %zmm0, %zmm3, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: kshiftlw $15, %k1, %k0 +; KNL-NEXT: kshiftlw $15, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: vmovd %ecx, %xmm3 ; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $13, %k1, %k0 +; KNL-NEXT: kshiftlw $13, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $12, %k1, %k0 +; KNL-NEXT: kshiftlw $12, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $11, %k1, %k0 +; KNL-NEXT: kshiftlw $11, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $10, %k1, %k0 +; KNL-NEXT: kshiftlw $10, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $9, %k1, %k0 +; KNL-NEXT: kshiftlw $9, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $8, %k1, %k0 +; KNL-NEXT: kshiftlw $8, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $7, %k1, %k0 +; KNL-NEXT: kshiftlw $7, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $6, %k1, %k0 +; KNL-NEXT: kshiftlw $6, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $9, %eax, %xmm3, 
%xmm3 -; KNL-NEXT: kshiftlw $5, %k1, %k0 +; KNL-NEXT: kshiftlw $5, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $4, %k1, %k0 +; KNL-NEXT: kshiftlw $4, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $3, %k1, %k0 +; KNL-NEXT: kshiftlw $3, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $2, %k1, %k0 +; KNL-NEXT: kshiftlw $2, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $1, %k1, %k0 +; KNL-NEXT: kshiftlw $1, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftrw $15, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k2, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z} +; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z} ; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k0 -; KNL-NEXT: kshiftlw $14, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax -; KNL-NEXT: kshiftlw $15, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %ecx +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: vmovd %ecx, %xmm4 ; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $13, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $13, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $12, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; 
KNL-NEXT: kshiftlw $12, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $11, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $11, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $10, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $10, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $9, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $9, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $8, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $8, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $7, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $7, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $6, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $6, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $5, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $5, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $4, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $4, %k0, %k1 +; KNL-NEXT: 
kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $3, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $3, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $2, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $1, %k0, %k2 -; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $1, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; KNL-NEXT: vmovups 4(%rdi), %zmm5 {%k2} {z} ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 -; KNL-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z} ; KNL-NEXT: vcmpltps %zmm5, %zmm0, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 @@ -2092,11 +2092,11 @@ ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; KNL-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 +; KNL-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3 +; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2 ; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3 ; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 @@ -2941,6 +2941,37 @@ ; ; KNL-LABEL: store_64i1: ; KNL: ## BB#0: +; KNL-NEXT: pushq %rbp +; KNL-NEXT: Lcfi9: +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: pushq %r15 +; KNL-NEXT: Lcfi10: +; KNL-NEXT: .cfi_def_cfa_offset 24 +; KNL-NEXT: pushq %r14 +; KNL-NEXT: 
Lcfi11: +; KNL-NEXT: .cfi_def_cfa_offset 32 +; KNL-NEXT: pushq %r13 +; KNL-NEXT: Lcfi12: +; KNL-NEXT: .cfi_def_cfa_offset 40 +; KNL-NEXT: pushq %r12 +; KNL-NEXT: Lcfi13: +; KNL-NEXT: .cfi_def_cfa_offset 48 +; KNL-NEXT: pushq %rbx +; KNL-NEXT: Lcfi14: +; KNL-NEXT: .cfi_def_cfa_offset 56 +; KNL-NEXT: Lcfi15: +; KNL-NEXT: .cfi_offset %rbx, -56 +; KNL-NEXT: Lcfi16: +; KNL-NEXT: .cfi_offset %r12, -48 +; KNL-NEXT: Lcfi17: +; KNL-NEXT: .cfi_offset %r13, -40 +; KNL-NEXT: Lcfi18: +; KNL-NEXT: .cfi_offset %r14, -32 +; KNL-NEXT: Lcfi19: +; KNL-NEXT: .cfi_offset %r15, -24 +; KNL-NEXT: Lcfi20: +; KNL-NEXT: .cfi_offset %rbp, -16 +; KNL-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ## 8-byte Spill ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -2952,275 +2983,286 @@ ; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vmovd %ecx, %xmm3 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftlw $11, %k0, %k1 -; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $10, %k0, %k1 -; KNL-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftlw $9, %k0, %k1 -; KNL-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftlw $8, %k0, %k1 -; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; 
KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftlw $7, %k0, %k1 -; KNL-NEXT: vpinsrb $5, %edx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftlw $6, %k0, %k1 -; KNL-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftlw $5, %k0, %k1 -; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %r12d ; KNL-NEXT: kshiftlw $4, %k0, %k1 -; KNL-NEXT: vpinsrb $8, %edx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r15d ; KNL-NEXT: kshiftlw $3, %k0, %k1 -; KNL-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r14d ; KNL-NEXT: kshiftlw $2, %k0, %k1 -; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %ebx +; KNL-NEXT: movl %ebx, -{{[0-9]+}}(%rsp) ## 4-byte Spill ; KNL-NEXT: kshiftlw $1, %k0, %k1 -; KNL-NEXT: vpinsrb $11, %edx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vmovd %edx, %xmm3 ; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 -; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm2 -; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL-NEXT: vpslld $31, %zmm2, %zmm2 -; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 -; KNL-NEXT: kmovw %k0, 6(%rdi) +; KNL-NEXT: vpinsrb $1, %ecx, %xmm3, %xmm2 +; KNL-NEXT: kmovw %k0, %r11d ; KNL-NEXT: kshiftlw $14, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $2, %esi, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %ecx +; 
KNL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill ; KNL-NEXT: kshiftlw $15, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %esi ; KNL-NEXT: kshiftlw $13, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %r8d ; KNL-NEXT: kshiftlw $12, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %ecx, %xmm2 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: vpinsrb $5, %edi, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $11, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $6, %r9d, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %r9d ; KNL-NEXT: kshiftlw $10, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vpinsrb $7, %ebp, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %ebp ; KNL-NEXT: kshiftlw $9, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: vpinsrb $8, %r10d, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %ebx ; KNL-NEXT: kshiftlw $8, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $9, %r13d, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %r13d ; KNL-NEXT: kshiftlw $7, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vpinsrb $10, %r12d, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %r12d ; KNL-NEXT: kshiftlw $6, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: vpinsrb $11, %r15d, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %r15d ; KNL-NEXT: kshiftlw $5, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $7, 
%eax, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $12, %r14d, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %r14d ; KNL-NEXT: kshiftlw $4, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $8, %edx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $3, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: kshiftlw $2, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %r11d, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %edi ; KNL-NEXT: kshiftlw $1, %k2, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $11, %edx, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vmovd %esi, %xmm3 +; KNL-NEXT: kmovw %k0, %esi ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL-NEXT: kshiftrw $15, %k2, %k0 -; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm1 -; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm2, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm3, %xmm2 ## 4-byte Folded Reload +; KNL-NEXT: vpinsrb $2, %r8d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $5, %ebp, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $6, %ebx, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $8, %r12d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $9, %r15d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $10, %r14d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $12, %r10d, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $13, %edi, %xmm2, %xmm2 +; KNL-NEXT: vpinsrb $14, %esi, %xmm2, %xmm2 ; KNL-NEXT: kmovw %k0, %eax -; 
KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: movq -{{[0-9]+}}(%rsp), %r8 ## 8-byte Reload +; KNL-NEXT: kmovw %k0, 6(%r8) +; KNL-NEXT: vpmovsxbd %xmm2, %zmm1 ; KNL-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL-NEXT: kmovw %k0, 4(%rdi) +; KNL-NEXT: kmovw %k0, 4(%r8) ; KNL-NEXT: kshiftlw $14, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill ; KNL-NEXT: kshiftlw $15, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %r10d ; KNL-NEXT: kshiftlw $13, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %r9d ; KNL-NEXT: kshiftlw $12, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %ecx, %xmm1 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %r11d ; KNL-NEXT: kshiftlw $11, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %r14d ; KNL-NEXT: kshiftlw $10, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %r15d ; KNL-NEXT: kshiftlw $9, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %r12d ; KNL-NEXT: kshiftlw $8, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %r13d ; KNL-NEXT: kshiftlw $7, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $5, %edx, %xmm1, %xmm1 ; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: kshiftlw $6, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $5, 
%k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %esi ; KNL-NEXT: kshiftlw $4, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $8, %edx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %edi ; KNL-NEXT: kshiftlw $3, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %ebp ; KNL-NEXT: kshiftlw $2, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k0, %ebx ; KNL-NEXT: kshiftlw $1, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $11, %edx, %xmm1, %xmm1 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: vmovd %r10d, %xmm1 +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm0 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm1, %xmm0 ## 4-byte Folded Reload +; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, %ebp, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $13, %ebx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: kmovw %k1, 2(%r8) ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $13, %eax, 
%xmm0, %xmm0 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %r8d ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r9d ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r10d ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vmovd %edx, %xmm1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r11d ; KNL-NEXT: kshiftlw $11, %k0, %k1 -; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %r14d ; KNL-NEXT: kshiftlw $10, %k0, %k1 -; KNL-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %r15d ; KNL-NEXT: kshiftlw $9, %k0, %k1 -; KNL-NEXT: vpinsrb $3, %edx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %r12d ; KNL-NEXT: kshiftlw $8, %k0, %k1 -; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftlw $7, %k0, %k1 -; KNL-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $6, %k0, %k1 -; KNL-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %edx ; KNL-NEXT: kshiftlw $5, %k0, %k1 -; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %esi ; KNL-NEXT: kshiftlw $4, %k0, %k1 -; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %edi ; KNL-NEXT: kshiftlw $3, %k0, %k1 -; KNL-NEXT: vpinsrb $9, %edx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, 
%k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: kmovw %k1, %ebp ; KNL-NEXT: kshiftlw $2, %k0, %k1 -; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k1, %ebx ; KNL-NEXT: kshiftlw $1, %k0, %k1 -; KNL-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $12, %edx, %xmm1, %xmm1 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; KNL-NEXT: vmovd %r9d, %xmm0 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $10, %esi, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, %ebp, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vpinsrb $13, %ebx, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm0 ; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; KNL-NEXT: kmovw %k0, 2(%rdi) ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, (%rdi) +; KNL-NEXT: movq -{{[0-9]+}}(%rsp), %rax ## 8-byte Reload +; KNL-NEXT: kmovw %k0, (%rax) +; KNL-NEXT: popq %rbx +; KNL-NEXT: popq %r12 +; KNL-NEXT: popq %r13 +; KNL-NEXT: popq %r14 +; KNL-NEXT: popq %r15 +; KNL-NEXT: popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: store_64i1: Index: test/CodeGen/X86/avx512-vec-cmp.ll 
=================================================================== --- test/CodeGen/X86/avx512-vec-cmp.ll +++ test/CodeGen/X86/avx512-vec-cmp.ll @@ -325,16 +325,16 @@ ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -506,146 +506,146 @@ ; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, (%rsp) -; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm1 +; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vmovd %ecx, %xmm0 -; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; KNL-NEXT: vmovd %ecx, %xmm1 +; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $2, %eax, 
%xmm1, %xmm1 ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $13, 
%eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: kmovw %k1, (%rsp) ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vpslld $31, %zmm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vmovd %ecx, %xmm1 -; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; KNL-NEXT: vmovd %ecx, %xmm0 +; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; KNL-NEXT: 
vpinsrb $5, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 -; KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, 
%zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1938,69 +1938,69 @@ ; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 ; AVX512F-32-NEXT: movb %al, %dl -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd 
%edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 ; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 -; AVX512F-32-NEXT: movb %al, %dl -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: shrb $3, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 -; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4] ; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 -; AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 -; AVX512F-32-NEXT: vpslldq 
{{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 ; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5 -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm5, %ymm5 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm6, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: movl %ecx, %esi ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm6 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7 ; 
AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm2, %ymm7, %ymm7 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %esi, %eax @@ -2349,23 +2349,24 @@ ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] ; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4 -; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $29, %eax -; AVX512F-32-NEXT: andb $1, %al -; AVX512F-32-NEXT: kmovd %eax, %k0 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: kmovd %eax, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 ; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4 ; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $29, %eax +; AVX512F-32-NEXT: andb $1, %al +; AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2373,8 +2374,7 @@ ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2] ; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $30, %eax ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 @@ -2412,15 +2412,15 @@ ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k2 {%k1} -; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 {%k1} ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl 
{{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl %esi, %eax @@ -2822,69 +2822,69 @@ ; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: andb $2, %dl +; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 ; AVX512F-32-NEXT: movb %al, %dl -; AVX512F-32-NEXT: andb $2, %dl -; AVX512F-32-NEXT: shrb %dl +; AVX512F-32-NEXT: andb $15, %dl +; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 ; AVX512F-32-NEXT: 
vinserti128 $1, %xmm3, %ymm0, %ymm4 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 -; AVX512F-32-NEXT: movb %al, %dl -; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al -; AVX512F-32-NEXT: shrb $2, %dl -; AVX512F-32-NEXT: kmovd %edx, %k1 +; AVX512F-32-NEXT: shrb $3, %al +; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 -; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4] ; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 -; AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 ; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5 -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm5, %ymm5 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm6, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: movl %ecx, %esi ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm6 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb 
%ymm3, %ymm2, %ymm7, %ymm7 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm2, %ymm7, %ymm7 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %esi, %eax @@ -3233,23 +3233,24 @@ ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] ; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4 -; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX512F-32-NEXT: movl %ecx, %eax -; AVX512F-32-NEXT: shrl $29, %eax -; AVX512F-32-NEXT: andb $1, %al -; AVX512F-32-NEXT: kmovd %eax, %k0 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: kmovd %eax, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 ; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4 ; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: movl %ecx, %eax +; AVX512F-32-NEXT: shrl $29, %eax +; AVX512F-32-NEXT: andb $1, %al +; 
AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -3257,8 +3258,7 @@ ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2] ; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $30, %eax ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 @@ -3296,15 +3296,15 @@ ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} -; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k2 {%k1} -; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 {%k1} ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: addl %esi, %eax Index: test/CodeGen/X86/avx512vl-vec-masked-cmp.ll =================================================================== --- test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -19,83 +19,103 @@ ; 
NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .Lcfi2: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi3: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi4: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi5: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi6: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi7: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; 
NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: 
kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -117,90 +137,110 @@ ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi3: +; NoVLX-NEXT: .Lcfi8: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi4: +; NoVLX-NEXT: .Lcfi9: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi5: +; NoVLX-NEXT: .Lcfi10: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi11: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi12: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi13: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi14: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi15: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb 
$10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -224,91 +264,111 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi6: +; NoVLX-NEXT: .Lcfi16: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi7: +; NoVLX-NEXT: .Lcfi17: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi8: +; NoVLX-NEXT: .Lcfi18: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi19: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi20: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi21: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi22: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi23: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: 
kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -333,91 +393,111 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi9: +; NoVLX-NEXT: .Lcfi24: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi10: +; NoVLX-NEXT: .Lcfi25: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi11: +; NoVLX-NEXT: .Lcfi26: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi27: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi28: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi29: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi30: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi31: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd 
%zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi 
; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; 
NoVLX-NEXT: retq entry: @@ -443,12 +523,12 @@ ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi12: +; NoVLX-NEXT: .Lcfi32: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi13: +; NoVLX-NEXT: .Lcfi33: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi14: +; NoVLX-NEXT: .Lcfi34: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -457,20 +537,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi15: +; NoVLX-NEXT: .Lcfi35: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi16: +; NoVLX-NEXT: .Lcfi36: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi17: +; NoVLX-NEXT: .Lcfi37: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi18: +; NoVLX-NEXT: .Lcfi38: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi19: +; NoVLX-NEXT: .Lcfi39: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -513,11 +597,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -529,15 +613,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, 
%xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -573,12 +653,12 @@ ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi20: +; NoVLX-NEXT: .Lcfi40: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi21: +; NoVLX-NEXT: .Lcfi41: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi22: +; NoVLX-NEXT: .Lcfi42: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -587,20 +667,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi23: +; NoVLX-NEXT: .Lcfi43: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi24: +; NoVLX-NEXT: .Lcfi44: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi25: +; NoVLX-NEXT: .Lcfi45: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi26: +; NoVLX-NEXT: .Lcfi46: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi27: +; NoVLX-NEXT: .Lcfi47: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -643,11 +727,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -659,15 +743,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -705,12 +785,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi28: +; NoVLX-NEXT: .Lcfi48: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi29: +; NoVLX-NEXT: .Lcfi49: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi30: +; NoVLX-NEXT: .Lcfi50: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -719,21 +799,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi31: +; NoVLX-NEXT: .Lcfi51: ; 
NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi32: +; NoVLX-NEXT: .Lcfi52: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi33: +; NoVLX-NEXT: .Lcfi53: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi34: +; NoVLX-NEXT: .Lcfi54: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi35: +; NoVLX-NEXT: .Lcfi55: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -776,11 +860,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -792,15 +876,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, 
%zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -839,12 +919,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi36: +; NoVLX-NEXT: .Lcfi56: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi37: +; NoVLX-NEXT: .Lcfi57: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi38: +; NoVLX-NEXT: .Lcfi58: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -853,21 +933,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi39: +; NoVLX-NEXT: .Lcfi59: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi40: +; NoVLX-NEXT: .Lcfi60: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi41: +; NoVLX-NEXT: .Lcfi61: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi42: +; NoVLX-NEXT: .Lcfi62: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi43: +; NoVLX-NEXT: .Lcfi63: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -910,11 +994,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, 
%xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -926,15 +1010,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -975,12 +1055,12 @@ ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi44: +; NoVLX-NEXT: .Lcfi64: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi45: +; NoVLX-NEXT: .Lcfi65: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi46: +; NoVLX-NEXT: .Lcfi66: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1024,12 +1104,12 @@ ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi47: +; NoVLX-NEXT: .Lcfi67: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi48: +; NoVLX-NEXT: .Lcfi68: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi49: +; NoVLX-NEXT: .Lcfi69: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1075,12 +1155,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi50: +; NoVLX-NEXT: .Lcfi70: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi51: +; NoVLX-NEXT: 
.Lcfi71: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi52: +; NoVLX-NEXT: .Lcfi72: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -1136,12 +1216,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi53: +; NoVLX-NEXT: .Lcfi73: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi54: +; NoVLX-NEXT: .Lcfi74: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi55: +; NoVLX-NEXT: .Lcfi75: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -1320,12 +1400,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi56: +; NoVLX-NEXT: .Lcfi76: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi57: +; NoVLX-NEXT: .Lcfi77: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi58: +; NoVLX-NEXT: .Lcfi78: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1333,8 +1413,6 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -1368,6 +1446,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1395,12 +1475,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi59: +; NoVLX-NEXT: 
.Lcfi79: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi60: +; NoVLX-NEXT: .Lcfi80: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi61: +; NoVLX-NEXT: .Lcfi81: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1408,8 +1488,6 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -1443,6 +1521,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1472,12 +1552,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi62: +; NoVLX-NEXT: .Lcfi82: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi63: +; NoVLX-NEXT: .Lcfi83: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi64: +; NoVLX-NEXT: .Lcfi84: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1486,8 +1566,6 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -1521,6 +1599,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: 
vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1551,12 +1631,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi65: +; NoVLX-NEXT: .Lcfi85: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi66: +; NoVLX-NEXT: .Lcfi86: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi67: +; NoVLX-NEXT: .Lcfi87: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1565,8 +1645,6 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -1600,6 +1678,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1631,12 +1711,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi68: +; NoVLX-NEXT: .Lcfi88: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi69: +; NoVLX-NEXT: .Lcfi89: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi70: +; NoVLX-NEXT: .Lcfi90: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1644,43 +1724,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, 
%xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1711,12 +1791,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi71: +; NoVLX-NEXT: .Lcfi91: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi72: +; NoVLX-NEXT: .Lcfi92: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi73: +; NoVLX-NEXT: .Lcfi93: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1724,43 +1804,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; 
NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1793,12 +1873,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi74: +; NoVLX-NEXT: .Lcfi94: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi75: +; NoVLX-NEXT: .Lcfi95: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi76: +; NoVLX-NEXT: .Lcfi96: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1807,43 +1887,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: 
vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1877,12 +1957,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi77: +; NoVLX-NEXT: .Lcfi97: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi78: +; NoVLX-NEXT: .Lcfi98: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi79: +; NoVLX-NEXT: .Lcfi99: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1891,43 +1971,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, 
%k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1963,90 +2043,110 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi80: +; NoVLX-NEXT: .Lcfi100: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi81: +; NoVLX-NEXT: .Lcfi101: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi82: +; NoVLX-NEXT: .Lcfi102: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi103: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi104: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi105: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi106: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi107: +; 
NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; 
NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: 
leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -2069,90 +2169,110 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi83: +; NoVLX-NEXT: .Lcfi108: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi84: +; NoVLX-NEXT: .Lcfi109: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi85: +; NoVLX-NEXT: .Lcfi110: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi111: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi112: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi113: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi114: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi115: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw 
$11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; 
NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -2177,91 +2297,111 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi86: +; NoVLX-NEXT: .Lcfi116: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi87: +; NoVLX-NEXT: .Lcfi117: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi88: +; NoVLX-NEXT: .Lcfi118: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi119: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi120: +; NoVLX-NEXT: .cfi_offset %r12, -48 
+; NoVLX-NEXT: .Lcfi121: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi122: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi123: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; 
NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -2287,91 +2427,111 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi89: +; NoVLX-NEXT: .Lcfi124: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi90: +; NoVLX-NEXT: .Lcfi125: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi91: +; NoVLX-NEXT: .Lcfi126: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi127: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi128: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi129: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi130: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi131: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; 
NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 
+; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -2398,12 +2558,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi92: +; NoVLX-NEXT: .Lcfi132: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi93: +; NoVLX-NEXT: .Lcfi133: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi94: +; NoVLX-NEXT: .Lcfi134: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2412,20 +2572,24 @@ ; NoVLX-NEXT: 
pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi95: +; NoVLX-NEXT: .Lcfi135: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi96: +; NoVLX-NEXT: .Lcfi136: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi97: +; NoVLX-NEXT: .Lcfi137: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi98: +; NoVLX-NEXT: .Lcfi138: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi99: +; NoVLX-NEXT: .Lcfi139: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -2468,11 +2632,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -2484,15 +2648,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, 
%eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2529,12 +2689,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi100: +; NoVLX-NEXT: .Lcfi140: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi101: +; NoVLX-NEXT: .Lcfi141: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi102: +; NoVLX-NEXT: .Lcfi142: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2543,20 +2703,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi103: +; NoVLX-NEXT: .Lcfi143: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi104: +; NoVLX-NEXT: .Lcfi144: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi105: +; NoVLX-NEXT: .Lcfi145: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi106: +; NoVLX-NEXT: .Lcfi146: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi107: +; NoVLX-NEXT: .Lcfi147: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -2599,11 +2763,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; 
NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -2615,15 +2779,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2662,12 +2822,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi108: +; NoVLX-NEXT: .Lcfi148: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi109: +; NoVLX-NEXT: .Lcfi149: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi110: +; NoVLX-NEXT: .Lcfi150: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2676,21 +2836,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi111: +; NoVLX-NEXT: .Lcfi151: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi112: +; NoVLX-NEXT: .Lcfi152: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi113: +; NoVLX-NEXT: .Lcfi153: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi114: +; NoVLX-NEXT: .Lcfi154: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi115: +; NoVLX-NEXT: .Lcfi155: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, 
%zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -2733,11 +2897,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -2749,15 +2913,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2797,12 +2957,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi116: +; NoVLX-NEXT: .Lcfi156: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi117: +; NoVLX-NEXT: .Lcfi157: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi118: +; NoVLX-NEXT: .Lcfi158: 
; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2811,21 +2971,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi119: +; NoVLX-NEXT: .Lcfi159: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi120: +; NoVLX-NEXT: .Lcfi160: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi121: +; NoVLX-NEXT: .Lcfi161: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi122: +; NoVLX-NEXT: .Lcfi162: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi123: +; NoVLX-NEXT: .Lcfi163: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -2868,11 +3032,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -2884,15 +3048,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw 
%k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2933,12 +3093,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi124: +; NoVLX-NEXT: .Lcfi164: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi125: +; NoVLX-NEXT: .Lcfi165: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi126: +; NoVLX-NEXT: .Lcfi166: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -2963,7 +3123,7 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm3 @@ -3083,36 +3243,36 @@ ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm1, %rcx -; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm7 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm0 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm1 -; NoVLX-NEXT: shrq 
$48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rax ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rcx +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -3176,81 +3336,81 @@ ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, 
%zmm0 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) @@ -3284,12 +3444,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi127: +; NoVLX-NEXT: .Lcfi167: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi128: +; NoVLX-NEXT: .Lcfi168: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi129: +; 
NoVLX-NEXT: .Lcfi169: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -3373,11 +3533,11 @@ ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm4 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm1 ; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm1 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -3552,12 +3712,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi130: +; NoVLX-NEXT: .Lcfi170: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi131: +; NoVLX-NEXT: .Lcfi171: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi132: +; NoVLX-NEXT: .Lcfi172: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -3708,176 +3868,176 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm7 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, 
%ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm4 -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm3 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm5 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 -; NoVLX-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm2 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm1 -; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm4, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; 
NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb 
$5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, 
%xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -3915,12 +4075,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi133: +; NoVLX-NEXT: .Lcfi173: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi134: +; NoVLX-NEXT: .Lcfi174: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi135: +; NoVLX-NEXT: .Lcfi175: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -3988,108 +4148,44 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm4 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm3, %ymm3 -; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 -; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 -; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm3 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; 
NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1 -; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm2, %ymm2 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm2, %ymm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, 
%ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -4138,24 +4234,88 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, 
%xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -4196,8 +4356,8 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4251,8 +4411,8 @@ ; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4306,28 +4466,28 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; 
NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4384,27 +4544,27 @@ ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4464,8 +4624,8 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, 
%xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4521,28 +4681,28 @@ ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4999,12 +5159,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi136: 
+; NoVLX-NEXT: .Lcfi176: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi137: +; NoVLX-NEXT: .Lcfi177: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi138: +; NoVLX-NEXT: .Lcfi178: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5042,12 +5202,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi139: +; NoVLX-NEXT: .Lcfi179: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi140: +; NoVLX-NEXT: .Lcfi180: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi141: +; NoVLX-NEXT: .Lcfi181: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5087,34 +5247,34 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi142: +; NoVLX-NEXT: .Lcfi182: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi143: +; NoVLX-NEXT: .Lcfi183: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi144: +; NoVLX-NEXT: .Lcfi184: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, 
%xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -5152,32 +5312,32 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi145: +; NoVLX-NEXT: .Lcfi185: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi146: +; NoVLX-NEXT: .Lcfi186: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi147: +; NoVLX-NEXT: .Lcfi187: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, 
%eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -5218,12 +5378,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi148: +; NoVLX-NEXT: .Lcfi188: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi149: +; NoVLX-NEXT: .Lcfi189: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi150: +; NoVLX-NEXT: .Lcfi190: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5265,35 +5425,35 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi151: +; NoVLX-NEXT: .Lcfi191: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi152: +; NoVLX-NEXT: .Lcfi192: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi153: +; NoVLX-NEXT: .Lcfi193: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: 
vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -5333,20 +5493,20 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi154: +; NoVLX-NEXT: .Lcfi194: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi155: +; NoVLX-NEXT: .Lcfi195: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi156: +; NoVLX-NEXT: .Lcfi196: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5382,20 +5542,20 @@ ; NoVLX-LABEL: 
test_vpcmpeqd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi157: +; NoVLX-NEXT: .Lcfi197: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi158: +; NoVLX-NEXT: .Lcfi198: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi159: +; NoVLX-NEXT: .Lcfi199: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5433,38 +5593,38 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi160: +; NoVLX-NEXT: .Lcfi200: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi161: +; NoVLX-NEXT: .Lcfi201: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi162: +; NoVLX-NEXT: .Lcfi202: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, 
%k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5504,38 +5664,38 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi163: +; NoVLX-NEXT: .Lcfi203: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi164: +; NoVLX-NEXT: .Lcfi204: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi165: +; NoVLX-NEXT: .Lcfi205: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: 
kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5576,12 +5736,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi166: +; NoVLX-NEXT: .Lcfi206: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi167: +; NoVLX-NEXT: .Lcfi207: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi168: +; NoVLX-NEXT: .Lcfi208: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: 
andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -5589,8 +5749,8 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5629,39 +5789,39 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi169: +; NoVLX-NEXT: .Lcfi209: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi170: +; NoVLX-NEXT: .Lcfi210: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi171: +; NoVLX-NEXT: .Lcfi211: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: 
kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -5892,20 +6052,18 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi172: +; NoVLX-NEXT: .Lcfi212: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi173: +; NoVLX-NEXT: .Lcfi213: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi174: +; NoVLX-NEXT: .Lcfi214: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -5939,6 +6097,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: 
vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -5967,20 +6127,18 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi175: +; NoVLX-NEXT: .Lcfi215: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi176: +; NoVLX-NEXT: .Lcfi216: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi177: +; NoVLX-NEXT: .Lcfi217: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -6014,6 +6172,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6044,12 +6204,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi178: +; NoVLX-NEXT: .Lcfi218: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi179: +; NoVLX-NEXT: .Lcfi219: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi180: +; NoVLX-NEXT: .Lcfi220: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6058,8 +6218,6 @@ ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: 
kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -6093,6 +6251,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6124,12 +6284,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi181: +; NoVLX-NEXT: .Lcfi221: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi182: +; NoVLX-NEXT: .Lcfi222: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi183: +; NoVLX-NEXT: .Lcfi223: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6138,8 +6298,6 @@ ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -6173,6 +6331,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6205,20 +6365,182 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi184: +; NoVLX-NEXT: .Lcfi224: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi185: +; NoVLX-NEXT: .Lcfi225: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi186: +; NoVLX-NEXT: .Lcfi226: ; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> 
undef, <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi227: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi228: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi229: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp eq <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi230: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi231: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi232: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; 
NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -6246,9 +6568,9 @@ ; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 @@ -6256,49 +6578,49 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> - %load = load i32, i32* %__b - %vec = insertelement <8 x i32> undef, i32 %load, i32 0 - %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> %2 = icmp eq <8 x i32> %0, %1 - %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> - %4 = bitcast <32 x i1> %3 to i32 - ret i32 %4 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 } -define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: +define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: ; VLX: # BB#0: # %entry -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} -; VLX-NEXT: kmovd 
%k0, %eax +; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq ; -; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi187: +; NoVLX-NEXT: .Lcfi233: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi188: +; NoVLX-NEXT: .Lcfi234: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi189: +; NoVLX-NEXT: .Lcfi235: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: kandw %k0, %k1, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -6336,168 +6658,6 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) -; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <4 x i64> %__a to <8 x i32> - %load = load i32, i32* %__b - %vec = insertelement <8 x i32> undef, i32 %load, i32 0 - %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> - %2 = icmp eq <8 x i32> %0, %1 - %3 = bitcast i8 %__u to <8 x i1> - %4 = and <8 x i1> %3, %2 - %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> - %6 = bitcast <32 x i1> %5 to i32 - ret i32 %6 -} - - -define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: -; VLX: # BB#0: # 
%entry -; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 -; VLX-NEXT: kmovq %k0, %rax -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi190: -; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi191: -; NoVLX-NEXT: .cfi_offset %rbp, -16 -; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi192: -; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, 
%xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, (%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: movq %rbp, %rsp -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <4 x i64> %__a to <8 x i32> - %1 = bitcast <4 x i64> %__b to <8 x i32> - %2 = icmp eq <8 x i32> %0, %1 - %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> - %4 = bitcast <64 x i1> %3 to i64 - ret i64 %4 -} - -define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: -; VLX: # BB#0: # %entry -; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; VLX-NEXT: kmovq %k0, %rax -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi193: -; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi194: -; NoVLX-NEXT: .cfi_offset %rbp, -16 -; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi195: -; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 -; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax @@ -6527,12 +6687,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi196: +; NoVLX-NEXT: .Lcfi236: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi197: +; NoVLX-NEXT: .Lcfi237: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi198: +; NoVLX-NEXT: .Lcfi238: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -6541,43 +6701,43 @@ ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, 
%xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6612,12 +6772,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi199: +; NoVLX-NEXT: .Lcfi239: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi200: +; NoVLX-NEXT: .Lcfi240: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi201: +; NoVLX-NEXT: .Lcfi241: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -6626,43 +6786,43 @@ ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: 
vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6698,55 +6858,55 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi202: +; NoVLX-NEXT: .Lcfi242: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi203: +; NoVLX-NEXT: .Lcfi243: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi204: +; NoVLX-NEXT: .Lcfi244: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; 
NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, 
%zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6781,12 +6941,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi205: +; NoVLX-NEXT: .Lcfi245: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi206: +; NoVLX-NEXT: .Lcfi246: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi207: +; NoVLX-NEXT: .Lcfi247: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -6795,43 +6955,43 @@ ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; 
NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -6868,87 +7028,107 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi208: +; NoVLX-NEXT: .Lcfi248: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi209: +; NoVLX-NEXT: .Lcfi249: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi210: +; NoVLX-NEXT: .Lcfi250: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi251: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi252: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi253: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi254: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi255: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw 
%k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; 
NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -6971,87 +7151,107 @@ 
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi211: +; NoVLX-NEXT: .Lcfi256: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi212: +; NoVLX-NEXT: .Lcfi257: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi213: +; NoVLX-NEXT: .Lcfi258: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi259: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi260: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi261: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi262: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi263: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, 
%r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -7076,88 +7276,108 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi214: +; NoVLX-NEXT: .Lcfi264: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi215: +; NoVLX-NEXT: .Lcfi265: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi216: +; NoVLX-NEXT: .Lcfi266: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi267: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi268: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi269: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi270: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi271: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -7183,88 +7403,108 @@ ; NoVLX-LABEL: 
test_masked_vpcmpeqd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi217: +; NoVLX-NEXT: .Lcfi272: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi218: +; NoVLX-NEXT: .Lcfi273: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi219: +; NoVLX-NEXT: .Lcfi274: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi275: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi276: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi277: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi278: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi279: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, 
%eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, 
%esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -7291,87 +7531,107 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi220: +; NoVLX-NEXT: .Lcfi280: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi221: +; NoVLX-NEXT: .Lcfi281: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi222: +; NoVLX-NEXT: .Lcfi282: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi283: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi284: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi285: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi286: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi287: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -7397,88 +7657,108 @@ ; NoVLX-LABEL: 
test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi223: +; NoVLX-NEXT: .Lcfi288: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi224: +; NoVLX-NEXT: .Lcfi289: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi225: +; NoVLX-NEXT: .Lcfi290: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi291: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi292: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi293: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi294: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi295: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: 
kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -7506,12 +7786,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi226: +; NoVLX-NEXT: .Lcfi296: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi227: +; NoVLX-NEXT: .Lcfi297: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi228: +; NoVLX-NEXT: .Lcfi298: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7520,17 +7800,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi229: +; NoVLX-NEXT: .Lcfi299: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi230: +; NoVLX-NEXT: .Lcfi300: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi231: +; NoVLX-NEXT: .Lcfi301: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi232: +; NoVLX-NEXT: .Lcfi302: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi233: +; NoVLX-NEXT: .Lcfi303: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd 
%zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -7573,11 +7857,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -7589,15 +7873,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -7634,12 +7914,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi234: +; NoVLX-NEXT: .Lcfi304: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi235: +; NoVLX-NEXT: .Lcfi305: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi236: +; NoVLX-NEXT: .Lcfi306: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: 
pushq %r14 @@ -7648,17 +7928,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi237: +; NoVLX-NEXT: .Lcfi307: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi238: +; NoVLX-NEXT: .Lcfi308: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi239: +; NoVLX-NEXT: .Lcfi309: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi240: +; NoVLX-NEXT: .Lcfi310: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi241: +; NoVLX-NEXT: .Lcfi311: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -7701,11 +7985,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -7717,15 +8001,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -7764,12 +8044,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi242: +; NoVLX-NEXT: .Lcfi312: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi243: +; NoVLX-NEXT: .Lcfi313: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi244: +; NoVLX-NEXT: .Lcfi314: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7778,18 +8058,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi245: +; NoVLX-NEXT: .Lcfi315: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi246: +; NoVLX-NEXT: .Lcfi316: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi247: +; NoVLX-NEXT: .Lcfi317: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi248: +; NoVLX-NEXT: .Lcfi318: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi249: +; NoVLX-NEXT: .Lcfi319: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -7832,11 +8116,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -7848,15 
+8132,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -7896,12 +8176,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi250: +; NoVLX-NEXT: .Lcfi320: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi251: +; NoVLX-NEXT: .Lcfi321: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi252: +; NoVLX-NEXT: .Lcfi322: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7910,18 +8190,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi253: +; NoVLX-NEXT: .Lcfi323: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi254: +; NoVLX-NEXT: .Lcfi324: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi255: +; NoVLX-NEXT: .Lcfi325: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi256: +; NoVLX-NEXT: .Lcfi326: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi257: +; NoVLX-NEXT: .Lcfi327: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; 
NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -7964,11 +8248,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -7980,15 +8264,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -8029,12 +8309,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi258: +; NoVLX-NEXT: .Lcfi328: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi259: +; NoVLX-NEXT: .Lcfi329: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi260: +; NoVLX-NEXT: .Lcfi330: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -8043,17 +8323,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi261: +; NoVLX-NEXT: .Lcfi331: ; 
NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi262: +; NoVLX-NEXT: .Lcfi332: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi263: +; NoVLX-NEXT: .Lcfi333: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi264: +; NoVLX-NEXT: .Lcfi334: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi265: +; NoVLX-NEXT: .Lcfi335: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -8096,11 +8380,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -8112,15 +8396,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -8160,12 +8440,12 @@ ; NoVLX-LABEL: 
test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi266: +; NoVLX-NEXT: .Lcfi336: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi267: +; NoVLX-NEXT: .Lcfi337: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi268: +; NoVLX-NEXT: .Lcfi338: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -8174,18 +8454,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi269: +; NoVLX-NEXT: .Lcfi339: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi270: +; NoVLX-NEXT: .Lcfi340: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi271: +; NoVLX-NEXT: .Lcfi341: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi272: +; NoVLX-NEXT: .Lcfi342: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi273: +; NoVLX-NEXT: .Lcfi343: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -8228,11 +8512,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -8244,15 +8528,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, 
%xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -8587,7 +8867,6 @@ ; ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask: ; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -8595,9 +8874,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -8738,7 +9018,6 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -8746,9 +9025,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; 
NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -9073,12 +9353,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi274: +; NoVLX-NEXT: .Lcfi344: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi275: +; NoVLX-NEXT: .Lcfi345: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi276: +; NoVLX-NEXT: .Lcfi346: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9116,12 +9396,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi277: +; NoVLX-NEXT: .Lcfi347: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi278: +; NoVLX-NEXT: .Lcfi348: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi279: +; NoVLX-NEXT: .Lcfi349: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9161,15 +9441,16 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi280: +; NoVLX-NEXT: .Lcfi350: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi281: +; NoVLX-NEXT: .Lcfi351: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi282: +; NoVLX-NEXT: .Lcfi352: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -9177,10 +9458,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; 
NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -9218,12 +9498,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi283: +; NoVLX-NEXT: .Lcfi353: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi284: +; NoVLX-NEXT: .Lcfi354: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi285: +; NoVLX-NEXT: .Lcfi355: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9276,12 +9556,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi286: +; NoVLX-NEXT: .Lcfi356: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi287: +; NoVLX-NEXT: .Lcfi357: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi288: +; NoVLX-NEXT: .Lcfi358: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9323,16 +9603,17 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi289: +; NoVLX-NEXT: .Lcfi359: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi290: +; NoVLX-NEXT: .Lcfi360: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi291: +; NoVLX-NEXT: .Lcfi361: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq 
%xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -9340,10 +9621,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -9383,20 +9663,20 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi292: +; NoVLX-NEXT: .Lcfi362: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi293: +; NoVLX-NEXT: .Lcfi363: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi294: +; NoVLX-NEXT: .Lcfi364: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9432,20 +9712,20 @@ ; NoVLX-LABEL: 
test_vpcmpeqq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi295: +; NoVLX-NEXT: .Lcfi365: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi296: +; NoVLX-NEXT: .Lcfi366: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi297: +; NoVLX-NEXT: .Lcfi367: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9483,12 +9763,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi298: +; NoVLX-NEXT: .Lcfi368: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi299: +; NoVLX-NEXT: .Lcfi369: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi300: +; NoVLX-NEXT: .Lcfi370: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9505,8 +9785,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: 
vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9546,12 +9826,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi301: +; NoVLX-NEXT: .Lcfi371: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi302: +; NoVLX-NEXT: .Lcfi372: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi303: +; NoVLX-NEXT: .Lcfi373: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9568,8 +9848,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9610,12 +9890,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi304: +; NoVLX-NEXT: .Lcfi374: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi305: +; NoVLX-NEXT: .Lcfi375: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi306: +; NoVLX-NEXT: .Lcfi376: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9623,8 +9903,8 @@ ; 
NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9663,12 +9943,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi307: +; NoVLX-NEXT: .Lcfi377: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi308: +; NoVLX-NEXT: .Lcfi378: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi309: +; NoVLX-NEXT: .Lcfi379: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9686,8 +9966,8 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -9734,8 +10014,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; 
NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -9791,85 +10071,85 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp eq <4 x i64> %0, %1 
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k0 -; NoVLX-NEXT: kshiftlw $7, %k0, %k0 -; NoVLX-NEXT: kshiftrw $7, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kshiftrw $1, %k1, %k1 -; NoVLX-NEXT: kshiftlw $1, %k1, %k1 -; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 -; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax -; NoVLX-NEXT: 
vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 -; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpextrb $12, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 -; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: # kill: %AL %AL %EAX -; NoVLX-NEXT: retq -entry: - %0 = bitcast <4 x i64> %__a to <4 x i64> - %load = load <4 x i64>, <4 x i64>* %__b - %1 = bitcast <4 x i64> %load to <4 x i64> - %2 = icmp eq <4 x i64> %0, %1 - %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> - %4 = bitcast <8 x i1> %3 to i8 - ret i8 %4 -} - -define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask: -; VLX: # BB#0: # %entry -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} -; VLX-NEXT: kmovd %k0, %eax -; VLX-NEXT: # kill: %AL %AL %EAX -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 -; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 -; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; 
NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpextrb $4, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -9927,28 +10207,28 @@ ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: 
vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -10010,8 +10290,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -10069,28 +10349,28 @@ ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: 
kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -10560,12 +10840,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi310: +; NoVLX-NEXT: .Lcfi380: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi311: +; NoVLX-NEXT: .Lcfi381: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi312: +; NoVLX-NEXT: .Lcfi382: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -10605,12 +10885,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi313: +; NoVLX-NEXT: .Lcfi383: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi314: +; NoVLX-NEXT: .Lcfi384: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi315: +; NoVLX-NEXT: .Lcfi385: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -10652,34 +10932,34 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi316: +; NoVLX-NEXT: .Lcfi386: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi317: +; NoVLX-NEXT: .Lcfi387: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi318: +; NoVLX-NEXT: .Lcfi388: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: 
kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -10719,34 +10999,34 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi319: +; NoVLX-NEXT: .Lcfi389: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi320: +; NoVLX-NEXT: .Lcfi390: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi321: +; NoVLX-NEXT: .Lcfi391: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: 
vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -10787,12 +11067,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi322: +; NoVLX-NEXT: .Lcfi392: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi323: +; NoVLX-NEXT: .Lcfi393: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi324: +; NoVLX-NEXT: .Lcfi394: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -10836,35 +11116,35 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi325: +; NoVLX-NEXT: .Lcfi395: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi326: +; NoVLX-NEXT: .Lcfi396: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi327: +; NoVLX-NEXT: .Lcfi397: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, 
%k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -10906,12 +11186,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi328: +; NoVLX-NEXT: .Lcfi398: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi329: +; NoVLX-NEXT: .Lcfi399: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi330: +; NoVLX-NEXT: .Lcfi400: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -10919,8 +11199,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -10957,12 +11237,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi331: +; NoVLX-NEXT: .Lcfi401: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi332: +; NoVLX-NEXT: .Lcfi402: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi333: +; NoVLX-NEXT: .Lcfi403: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -10970,8 +11250,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -11010,39 +11290,39 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi334: +; NoVLX-NEXT: .Lcfi404: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi335: +; NoVLX-NEXT: .Lcfi405: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi336: +; NoVLX-NEXT: .Lcfi406: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, 
%ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -11083,39 +11363,39 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi337: +; NoVLX-NEXT: .Lcfi407: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi338: +; NoVLX-NEXT: .Lcfi408: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; 
NoVLX-NEXT: .Lcfi339: +; NoVLX-NEXT: .Lcfi409: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -11157,12 +11437,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq 
%rbp -; NoVLX-NEXT: .Lcfi340: +; NoVLX-NEXT: .Lcfi410: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi341: +; NoVLX-NEXT: .Lcfi411: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi342: +; NoVLX-NEXT: .Lcfi412: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -11171,8 +11451,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -11212,40 +11492,40 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi343: +; NoVLX-NEXT: .Lcfi413: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi344: +; NoVLX-NEXT: .Lcfi414: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi345: +; NoVLX-NEXT: .Lcfi415: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: 
kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -11452,18 +11732,16 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi346: +; NoVLX-NEXT: .Lcfi416: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi347: +; NoVLX-NEXT: .Lcfi417: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi348: +; NoVLX-NEXT: .Lcfi418: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d 
@@ -11497,6 +11775,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -11525,18 +11805,16 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi349: +; NoVLX-NEXT: .Lcfi419: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi350: +; NoVLX-NEXT: .Lcfi420: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi351: +; NoVLX-NEXT: .Lcfi421: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -11570,6 +11848,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -11600,19 +11880,17 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi352: +; NoVLX-NEXT: .Lcfi422: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi353: +; NoVLX-NEXT: .Lcfi423: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi354: +; NoVLX-NEXT: .Lcfi424: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw 
%k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -11646,6 +11924,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -11677,19 +11957,17 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi355: +; NoVLX-NEXT: .Lcfi425: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi356: +; NoVLX-NEXT: .Lcfi426: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi357: +; NoVLX-NEXT: .Lcfi427: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -11723,6 +12001,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -11755,18 +12035,16 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi358: +; NoVLX-NEXT: .Lcfi428: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi359: +; NoVLX-NEXT: .Lcfi429: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi360: +; NoVLX-NEXT: .Lcfi430: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, 
%rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -11800,6 +12078,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -11831,19 +12111,17 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi361: +; NoVLX-NEXT: .Lcfi431: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi362: +; NoVLX-NEXT: .Lcfi432: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi363: +; NoVLX-NEXT: .Lcfi433: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -11877,6 +12155,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -11910,53 +12190,53 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi364: +; NoVLX-NEXT: .Lcfi434: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi365: +; NoVLX-NEXT: .Lcfi435: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; 
NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi366: +; NoVLX-NEXT: .Lcfi436: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; 
NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -11988,53 +12268,53 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi367: +; NoVLX-NEXT: .Lcfi437: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi368: +; NoVLX-NEXT: .Lcfi438: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi369: +; NoVLX-NEXT: .Lcfi439: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, 
%eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12068,54 +12348,54 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi370: +; NoVLX-NEXT: .Lcfi440: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi371: +; NoVLX-NEXT: .Lcfi441: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi372: +; NoVLX-NEXT: .Lcfi442: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 
{%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: 
vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12150,54 +12430,54 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi373: +; NoVLX-NEXT: .Lcfi443: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi374: +; NoVLX-NEXT: .Lcfi444: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi375: +; NoVLX-NEXT: .Lcfi445: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: 
vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12233,53 +12513,53 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi376: +; NoVLX-NEXT: .Lcfi446: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi377: +; NoVLX-NEXT: .Lcfi447: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi378: +; NoVLX-NEXT: .Lcfi448: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, 
%k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; 
NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12314,54 +12594,54 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi379: +; NoVLX-NEXT: .Lcfi449: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi380: +; NoVLX-NEXT: .Lcfi450: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi381: +; NoVLX-NEXT: .Lcfi451: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb 
$4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12397,90 +12677,110 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi382: +; NoVLX-NEXT: .Lcfi452: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi383: +; NoVLX-NEXT: .Lcfi453: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi384: +; NoVLX-NEXT: .Lcfi454: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi455: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi456: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi457: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi458: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi459: +; NoVLX-NEXT: .cfi_offset %r15, 
-24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; 
NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq 
%rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -12502,90 +12802,110 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi385: +; NoVLX-NEXT: .Lcfi460: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi386: +; NoVLX-NEXT: .Lcfi461: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi387: +; NoVLX-NEXT: .Lcfi462: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi463: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi464: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi465: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi466: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi467: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: 
vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -12609,91 +12929,111 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi388: +; NoVLX-NEXT: .Lcfi468: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi389: +; NoVLX-NEXT: .Lcfi469: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi390: +; NoVLX-NEXT: .Lcfi470: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi471: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi472: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: 
.Lcfi473: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi474: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi475: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: 
kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld 
$31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -12718,91 +13058,111 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi391: +; NoVLX-NEXT: .Lcfi476: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi392: +; NoVLX-NEXT: .Lcfi477: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi393: +; NoVLX-NEXT: .Lcfi478: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi479: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi480: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi481: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi482: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi483: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: 
kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; 
NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -12828,12 +13188,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi394: +; NoVLX-NEXT: .Lcfi484: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi395: +; NoVLX-NEXT: .Lcfi485: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi396: +; NoVLX-NEXT: .Lcfi486: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -12842,20 +13202,24 @@ ; 
NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi397: +; NoVLX-NEXT: .Lcfi487: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi398: +; NoVLX-NEXT: .Lcfi488: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi399: +; NoVLX-NEXT: .Lcfi489: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi400: +; NoVLX-NEXT: .Lcfi490: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi401: +; NoVLX-NEXT: .Lcfi491: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -12898,11 +13262,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -12914,15 +13278,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; 
NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -12958,12 +13318,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi402: +; NoVLX-NEXT: .Lcfi492: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi403: +; NoVLX-NEXT: .Lcfi493: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi404: +; NoVLX-NEXT: .Lcfi494: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -12972,20 +13332,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi405: +; NoVLX-NEXT: .Lcfi495: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi406: +; NoVLX-NEXT: .Lcfi496: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi407: +; NoVLX-NEXT: .Lcfi497: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi408: +; NoVLX-NEXT: .Lcfi498: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi409: +; NoVLX-NEXT: .Lcfi499: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -13028,11 +13392,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; 
NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -13044,15 +13408,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -13090,12 +13450,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi410: +; NoVLX-NEXT: .Lcfi500: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi411: +; NoVLX-NEXT: .Lcfi501: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi412: +; NoVLX-NEXT: .Lcfi502: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -13104,21 +13464,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi413: +; NoVLX-NEXT: .Lcfi503: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi414: +; NoVLX-NEXT: .Lcfi504: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi415: +; NoVLX-NEXT: .Lcfi505: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi416: +; NoVLX-NEXT: .Lcfi506: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi417: +; NoVLX-NEXT: .Lcfi507: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd 
%xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -13161,11 +13525,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -13177,15 +13541,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -13224,12 +13584,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi418: +; NoVLX-NEXT: .Lcfi508: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi419: +; NoVLX-NEXT: .Lcfi509: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; 
NoVLX-NEXT: .Lcfi420: +; NoVLX-NEXT: .Lcfi510: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -13238,21 +13598,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi421: +; NoVLX-NEXT: .Lcfi511: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi422: +; NoVLX-NEXT: .Lcfi512: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi423: +; NoVLX-NEXT: .Lcfi513: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi424: +; NoVLX-NEXT: .Lcfi514: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi425: +; NoVLX-NEXT: .Lcfi515: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -13295,11 +13659,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -13311,15 +13675,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; 
NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -13360,12 +13720,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi426: +; NoVLX-NEXT: .Lcfi516: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi427: +; NoVLX-NEXT: .Lcfi517: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi428: +; NoVLX-NEXT: .Lcfi518: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -13409,12 +13769,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi429: +; NoVLX-NEXT: .Lcfi519: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi430: +; NoVLX-NEXT: .Lcfi520: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi431: +; NoVLX-NEXT: .Lcfi521: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -13460,12 +13820,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi432: +; NoVLX-NEXT: .Lcfi522: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi433: +; NoVLX-NEXT: .Lcfi523: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi434: +; NoVLX-NEXT: .Lcfi524: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -13521,12 +13881,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: 
.Lcfi435: +; NoVLX-NEXT: .Lcfi525: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi436: +; NoVLX-NEXT: .Lcfi526: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi437: +; NoVLX-NEXT: .Lcfi527: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -13705,12 +14065,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi438: +; NoVLX-NEXT: .Lcfi528: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi439: +; NoVLX-NEXT: .Lcfi529: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi440: +; NoVLX-NEXT: .Lcfi530: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -13718,8 +14078,6 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -13753,6 +14111,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -13780,12 +14140,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi441: +; NoVLX-NEXT: .Lcfi531: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi442: +; NoVLX-NEXT: .Lcfi532: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi443: +; NoVLX-NEXT: .Lcfi533: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -13793,8 +14153,6 @@ ; NoVLX-NEXT: 
vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -13828,6 +14186,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -13857,12 +14217,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi444: +; NoVLX-NEXT: .Lcfi534: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi445: +; NoVLX-NEXT: .Lcfi535: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi446: +; NoVLX-NEXT: .Lcfi536: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -13871,8 +14231,6 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -13906,6 +14264,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -13936,12 +14296,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi447: +; NoVLX-NEXT: .Lcfi537: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; 
NoVLX-NEXT: .Lcfi448: +; NoVLX-NEXT: .Lcfi538: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi449: +; NoVLX-NEXT: .Lcfi539: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -13950,8 +14310,6 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -13985,6 +14343,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -14016,12 +14376,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi450: +; NoVLX-NEXT: .Lcfi540: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi451: +; NoVLX-NEXT: .Lcfi541: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi452: +; NoVLX-NEXT: .Lcfi542: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14029,43 +14389,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: 
kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -14096,12 +14456,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # 
%entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi453: +; NoVLX-NEXT: .Lcfi543: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi454: +; NoVLX-NEXT: .Lcfi544: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi455: +; NoVLX-NEXT: .Lcfi545: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14109,43 +14469,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -14178,12 +14538,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi456: +; NoVLX-NEXT: .Lcfi546: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi457: +; NoVLX-NEXT: .Lcfi547: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi458: +; NoVLX-NEXT: .Lcfi548: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14192,43 +14552,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: 
vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -14262,12 +14622,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; 
NoVLX-NEXT: .Lcfi459: +; NoVLX-NEXT: .Lcfi549: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi460: +; NoVLX-NEXT: .Lcfi550: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi461: +; NoVLX-NEXT: .Lcfi551: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14276,43 +14636,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -14348,90 +14708,110 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi462: +; NoVLX-NEXT: .Lcfi552: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi463: +; NoVLX-NEXT: .Lcfi553: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi464: +; NoVLX-NEXT: .Lcfi554: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi555: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi556: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi557: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi558: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi559: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) 
; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -14454,90 +14834,110 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: ; 
NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi465: +; NoVLX-NEXT: .Lcfi560: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi466: +; NoVLX-NEXT: .Lcfi561: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi467: +; NoVLX-NEXT: .Lcfi562: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi563: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi564: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi565: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi566: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi567: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -14562,91 +14962,111 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi468: +; NoVLX-NEXT: .Lcfi568: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi469: +; NoVLX-NEXT: .Lcfi569: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi470: +; NoVLX-NEXT: .Lcfi570: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi571: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi572: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi573: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi574: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi575: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: 
popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -14672,91 +15092,111 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi471: +; NoVLX-NEXT: .Lcfi576: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi472: +; NoVLX-NEXT: .Lcfi577: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi473: +; NoVLX-NEXT: .Lcfi578: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi579: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi580: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi581: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi582: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi583: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb 
$2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -14783,12 +15223,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi474: +; NoVLX-NEXT: .Lcfi584: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi475: +; NoVLX-NEXT: .Lcfi585: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi476: +; NoVLX-NEXT: .Lcfi586: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -14797,20 +15237,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi477: +; NoVLX-NEXT: .Lcfi587: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi478: +; NoVLX-NEXT: .Lcfi588: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: 
.Lcfi479: +; NoVLX-NEXT: .Lcfi589: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi480: +; NoVLX-NEXT: .Lcfi590: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi481: +; NoVLX-NEXT: .Lcfi591: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -14853,11 +15297,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -14869,15 +15313,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -14914,12 +15354,12 @@ ; NoVLX-LABEL: 
test_vpcmpsgtw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi482: +; NoVLX-NEXT: .Lcfi592: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi483: +; NoVLX-NEXT: .Lcfi593: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi484: +; NoVLX-NEXT: .Lcfi594: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -14928,20 +15368,24 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi485: +; NoVLX-NEXT: .Lcfi595: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi486: +; NoVLX-NEXT: .Lcfi596: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi487: +; NoVLX-NEXT: .Lcfi597: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi488: +; NoVLX-NEXT: .Lcfi598: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi489: +; NoVLX-NEXT: .Lcfi599: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -14984,11 +15428,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -15000,15 +15444,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: 
vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -15047,12 +15487,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi490: +; NoVLX-NEXT: .Lcfi600: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi491: +; NoVLX-NEXT: .Lcfi601: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi492: +; NoVLX-NEXT: .Lcfi602: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -15061,21 +15501,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi493: +; NoVLX-NEXT: .Lcfi603: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi494: +; NoVLX-NEXT: .Lcfi604: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi495: +; NoVLX-NEXT: .Lcfi605: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi496: +; NoVLX-NEXT: .Lcfi606: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi497: +; NoVLX-NEXT: .Lcfi607: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; 
NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -15118,11 +15562,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -15134,15 +15578,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -15182,12 +15622,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi498: +; NoVLX-NEXT: .Lcfi608: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi499: +; NoVLX-NEXT: .Lcfi609: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi500: +; NoVLX-NEXT: .Lcfi610: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -15196,21 +15636,25 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; 
NoVLX-NEXT: .Lcfi501: +; NoVLX-NEXT: .Lcfi611: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi502: +; NoVLX-NEXT: .Lcfi612: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi503: +; NoVLX-NEXT: .Lcfi613: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi504: +; NoVLX-NEXT: .Lcfi614: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi505: +; NoVLX-NEXT: .Lcfi615: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -15253,11 +15697,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -15269,15 +15713,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; 
NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -15318,12 +15758,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi506: +; NoVLX-NEXT: .Lcfi616: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi507: +; NoVLX-NEXT: .Lcfi617: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi508: +; NoVLX-NEXT: .Lcfi618: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -15348,7 +15788,7 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm3 @@ -15468,36 +15908,36 @@ ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm1, %rcx -; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm7 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm0 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm1 -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rax ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, 
%ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rcx +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -15561,81 +16001,81 @@ ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: 
vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) @@ -15669,12 +16109,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi509: +; NoVLX-NEXT: .Lcfi619: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi510: +; NoVLX-NEXT: .Lcfi620: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi511: +; NoVLX-NEXT: .Lcfi621: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -15758,11 +16198,11 @@ ; NoVLX-NEXT: vpinsrw $5, 
%ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm4 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm1 ; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm1 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -15937,12 +16377,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi512: +; NoVLX-NEXT: .Lcfi622: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi513: +; NoVLX-NEXT: .Lcfi623: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi514: +; NoVLX-NEXT: .Lcfi624: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -16093,176 +16533,176 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm7 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vinserti128 $1, %xmm5, 
%ymm6, %ymm4 -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm3 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm5 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 -; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm1, %ymm2 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: 
kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax 
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: 
vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -16300,12 +16740,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi515: +; NoVLX-NEXT: .Lcfi625: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi516: +; NoVLX-NEXT: .Lcfi626: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi517: +; NoVLX-NEXT: .Lcfi627: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -16373,174 +16813,174 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw 
$5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm4 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm3, %ymm3 -; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 -; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 -; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $4, %eax, 
%xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; NoVLX-NEXT: 
vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1 -; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm2, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb 
$7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovdb 
%zmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -16581,8 +17021,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -16636,8 +17076,8 @@ ; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -16691,28 +17131,28 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; 
NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -16769,27 +17209,27 @@ ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; 
NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -16849,8 +17289,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -16906,28 +17346,28 @@ ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: 
vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -17384,12 +17824,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi518: +; NoVLX-NEXT: .Lcfi628: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi519: +; NoVLX-NEXT: .Lcfi629: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi520: +; NoVLX-NEXT: .Lcfi630: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -17427,12 +17867,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi521: +; NoVLX-NEXT: .Lcfi631: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi522: +; NoVLX-NEXT: .Lcfi632: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi523: +; NoVLX-NEXT: .Lcfi633: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -17472,34 +17912,34 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi524: +; NoVLX-NEXT: .Lcfi634: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi525: +; NoVLX-NEXT: .Lcfi635: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi526: +; NoVLX-NEXT: .Lcfi636: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -17537,32 +17977,32 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi527: +; NoVLX-NEXT: .Lcfi637: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi528: +; NoVLX-NEXT: .Lcfi638: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi529: +; NoVLX-NEXT: .Lcfi639: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax 
-; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -17603,12 +18043,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi530: +; NoVLX-NEXT: .Lcfi640: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi531: +; NoVLX-NEXT: .Lcfi641: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi532: +; NoVLX-NEXT: .Lcfi642: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -17650,35 +18090,35 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi533: +; NoVLX-NEXT: .Lcfi643: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi534: +; NoVLX-NEXT: .Lcfi644: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi535: +; NoVLX-NEXT: .Lcfi645: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: 
kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -17718,20 +18158,20 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi536: +; NoVLX-NEXT: .Lcfi646: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi537: +; NoVLX-NEXT: .Lcfi647: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi538: +; NoVLX-NEXT: .Lcfi648: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -17767,20 +18207,20 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi539: +; NoVLX-NEXT: .Lcfi649: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi540: +; NoVLX-NEXT: .Lcfi650: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi541: +; NoVLX-NEXT: .Lcfi651: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -17818,38 +18258,38 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi542: +; NoVLX-NEXT: .Lcfi652: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi543: +; NoVLX-NEXT: .Lcfi653: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi544: +; NoVLX-NEXT: .Lcfi654: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: 
kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -17889,38 +18329,38 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi545: +; NoVLX-NEXT: .Lcfi655: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi546: +; 
NoVLX-NEXT: .Lcfi656: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi547: +; NoVLX-NEXT: .Lcfi657: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -17961,12 +18401,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; 
NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi548: +; NoVLX-NEXT: .Lcfi658: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi549: +; NoVLX-NEXT: .Lcfi659: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi550: +; NoVLX-NEXT: .Lcfi660: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -17974,8 +18414,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -18014,39 +18454,39 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi551: +; NoVLX-NEXT: .Lcfi661: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi552: +; NoVLX-NEXT: .Lcfi662: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi553: +; NoVLX-NEXT: .Lcfi663: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw 
$15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -18277,20 +18717,18 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi554: +; NoVLX-NEXT: .Lcfi664: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi555: +; NoVLX-NEXT: .Lcfi665: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi556: +; NoVLX-NEXT: .Lcfi666: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; 
NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -18324,6 +18762,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -18352,20 +18792,18 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi557: +; NoVLX-NEXT: .Lcfi667: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi558: +; NoVLX-NEXT: .Lcfi668: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi559: +; NoVLX-NEXT: .Lcfi669: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -18399,6 +18837,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -18429,12 +18869,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi560: +; NoVLX-NEXT: .Lcfi670: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi561: +; NoVLX-NEXT: .Lcfi671: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: 
movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi562: +; NoVLX-NEXT: .Lcfi672: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18443,8 +18883,6 @@ ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -18478,6 +18916,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -18509,12 +18949,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi563: +; NoVLX-NEXT: .Lcfi673: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi564: +; NoVLX-NEXT: .Lcfi674: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi565: +; NoVLX-NEXT: .Lcfi675: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18523,8 +18963,6 @@ ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -18558,6 +18996,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -18590,20 +19030,182 @@ ; NoVLX-LABEL: 
test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi566: +; NoVLX-NEXT: .Lcfi676: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi567: +; NoVLX-NEXT: .Lcfi677: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi568: +; NoVLX-NEXT: .Lcfi678: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: 
vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi679: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi680: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi681: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sgt <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi682: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi683: 
+; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi684: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -18631,9 +19233,9 @@ ; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 @@ -18641,49 +19243,49 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> - %load = load i32, i32* %__b - %vec = insertelement <8 x i32> undef, i32 %load, i32 0 - %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> %2 = icmp sgt <8 x i32> %0, %1 - %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> - %4 = bitcast <32 x i1> %3 to i32 - ret i32 %4 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 } -define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) 
local_unnamed_addr { -; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: ; VLX: # BB#0: # %entry -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} -; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq ; -; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi569: +; NoVLX-NEXT: .Lcfi685: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi570: +; NoVLX-NEXT: .Lcfi686: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi571: +; NoVLX-NEXT: .Lcfi687: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: kandw %k0, %k1, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -18721,168 +19323,6 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) -; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <4 x i64> %__a to <8 x i32> - %load = load i32, i32* %__b - %vec = insertelement <8 x i32> undef, i32 %load, i32 0 - %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> - %2 = icmp sgt <8 x 
i32> %0, %1 - %3 = bitcast i8 %__u to <8 x i1> - %4 = and <8 x i1> %3, %2 - %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> - %6 = bitcast <32 x i1> %5 to i32 - ret i32 %6 -} - - -define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: -; VLX: # BB#0: # %entry -; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 -; VLX-NEXT: kmovq %k0, %rax -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi572: -; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi573: -; NoVLX-NEXT: .cfi_offset %rbp, -16 -; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi574: -; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; 
NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, (%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: movq %rbp, %rsp -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <4 x i64> %__a to <8 x i32> - %1 = bitcast <4 x i64> %__b to <8 x i32> - %2 = icmp sgt <8 x i32> %0, %1 - %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> - %4 = bitcast <64 x i1> %3 to i64 - ret i64 %4 -} - -define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: -; VLX: # BB#0: # %entry -; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 -; VLX-NEXT: kmovq %k0, %rax -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi575: -; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi576: -; NoVLX-NEXT: .cfi_offset %rbp, -16 -; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi577: -; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 -; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $14, 
%k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax @@ -18912,12 +19352,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi578: +; NoVLX-NEXT: .Lcfi688: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi579: +; NoVLX-NEXT: .Lcfi689: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi580: +; NoVLX-NEXT: .Lcfi690: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, 
%rsp ; NoVLX-NEXT: subq $64, %rsp @@ -18926,43 +19366,43 @@ ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, 
%k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -18997,12 +19437,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi581: +; NoVLX-NEXT: .Lcfi691: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi582: +; NoVLX-NEXT: .Lcfi692: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi583: +; NoVLX-NEXT: .Lcfi693: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -19011,43 +19451,43 @@ ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw 
$11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -19083,55 +19523,55 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi584: +; NoVLX-NEXT: .Lcfi694: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi585: +; NoVLX-NEXT: .Lcfi695: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi586: +; NoVLX-NEXT: .Lcfi696: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 
%YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw 
%k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -19166,12 +19606,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi587: +; NoVLX-NEXT: .Lcfi697: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi588: +; NoVLX-NEXT: .Lcfi698: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi589: +; NoVLX-NEXT: .Lcfi699: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -19180,43 +19620,43 @@ ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -19253,87 +19693,107 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi590: +; NoVLX-NEXT: .Lcfi700: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi591: +; NoVLX-NEXT: .Lcfi701: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi592: +; NoVLX-NEXT: .Lcfi702: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, 
%rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi703: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi704: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi705: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi706: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi707: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi 
; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -19356,87 +19816,107 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi593: +; NoVLX-NEXT: .Lcfi708: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi594: +; NoVLX-NEXT: .Lcfi709: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi595: +; NoVLX-NEXT: .Lcfi710: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi711: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi712: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi713: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi714: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi715: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; 
NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -19461,88 +19941,108 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi596: +; NoVLX-NEXT: .Lcfi716: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi597: +; NoVLX-NEXT: .Lcfi717: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi598: +; NoVLX-NEXT: .Lcfi718: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi719: +; 
NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi720: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi721: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi722: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi723: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, 
%k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, 
%zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -19568,88 +20068,108 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi599: +; NoVLX-NEXT: .Lcfi724: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi600: +; NoVLX-NEXT: .Lcfi725: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi601: +; NoVLX-NEXT: .Lcfi726: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi727: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi728: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi729: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi730: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi731: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -19676,87 +20196,107 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi602: +; NoVLX-NEXT: .Lcfi732: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi603: +; NoVLX-NEXT: .Lcfi733: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi604: +; NoVLX-NEXT: .Lcfi734: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi735: +; 
NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi736: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi737: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi738: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi739: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: 
vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd 
%zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -19782,88 +20322,108 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi605: +; NoVLX-NEXT: .Lcfi740: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi606: +; NoVLX-NEXT: .Lcfi741: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi607: +; NoVLX-NEXT: .Lcfi742: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi743: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi744: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi745: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi746: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi747: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: 
kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -19891,12 +20451,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi608: +; NoVLX-NEXT: .Lcfi748: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi609: +; NoVLX-NEXT: .Lcfi749: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi610: +; NoVLX-NEXT: .Lcfi750: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -19905,17 +20465,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi611: +; NoVLX-NEXT: .Lcfi751: ; NoVLX-NEXT: .cfi_offset 
%rbx, -56 -; NoVLX-NEXT: .Lcfi612: +; NoVLX-NEXT: .Lcfi752: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi613: +; NoVLX-NEXT: .Lcfi753: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi614: +; NoVLX-NEXT: .Lcfi754: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi615: +; NoVLX-NEXT: .Lcfi755: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -19958,11 +20522,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -19974,15 +20538,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -20019,12 +20579,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: 
; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi616: +; NoVLX-NEXT: .Lcfi756: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi617: +; NoVLX-NEXT: .Lcfi757: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi618: +; NoVLX-NEXT: .Lcfi758: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20033,17 +20593,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi619: +; NoVLX-NEXT: .Lcfi759: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi620: +; NoVLX-NEXT: .Lcfi760: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi621: +; NoVLX-NEXT: .Lcfi761: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi622: +; NoVLX-NEXT: .Lcfi762: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi623: +; NoVLX-NEXT: .Lcfi763: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -20086,11 +20650,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -20102,15 +20666,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -20149,12 +20709,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi624: +; NoVLX-NEXT: .Lcfi764: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi625: +; NoVLX-NEXT: .Lcfi765: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi626: +; NoVLX-NEXT: .Lcfi766: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20163,18 +20723,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi627: +; NoVLX-NEXT: .Lcfi767: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi628: +; NoVLX-NEXT: .Lcfi768: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi629: +; NoVLX-NEXT: .Lcfi769: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi630: +; NoVLX-NEXT: .Lcfi770: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi631: +; NoVLX-NEXT: .Lcfi771: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -20217,11 +20781,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, 
%xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -20233,15 +20797,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -20281,12 +20841,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi632: +; NoVLX-NEXT: .Lcfi772: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi633: +; NoVLX-NEXT: .Lcfi773: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi634: +; NoVLX-NEXT: .Lcfi774: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20295,18 +20855,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi635: +; NoVLX-NEXT: .Lcfi775: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi636: +; NoVLX-NEXT: .Lcfi776: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi637: +; NoVLX-NEXT: .Lcfi777: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: 
.Lcfi638: +; NoVLX-NEXT: .Lcfi778: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi639: +; NoVLX-NEXT: .Lcfi779: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -20349,11 +20913,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -20365,15 +20929,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -20414,12 +20974,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi640: +; NoVLX-NEXT: .Lcfi780: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: 
.Lcfi641: +; NoVLX-NEXT: .Lcfi781: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi642: +; NoVLX-NEXT: .Lcfi782: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20428,17 +20988,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi643: +; NoVLX-NEXT: .Lcfi783: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi644: +; NoVLX-NEXT: .Lcfi784: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi645: +; NoVLX-NEXT: .Lcfi785: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi646: +; NoVLX-NEXT: .Lcfi786: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi647: +; NoVLX-NEXT: .Lcfi787: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -20481,11 +21045,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -20497,15 +21061,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: 
kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -20545,12 +21105,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi648: +; NoVLX-NEXT: .Lcfi788: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi649: +; NoVLX-NEXT: .Lcfi789: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi650: +; NoVLX-NEXT: .Lcfi790: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20559,18 +21119,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi651: +; NoVLX-NEXT: .Lcfi791: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi652: +; NoVLX-NEXT: .Lcfi792: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi653: +; NoVLX-NEXT: .Lcfi793: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi654: +; NoVLX-NEXT: .Lcfi794: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi655: +; NoVLX-NEXT: .Lcfi795: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -20613,11 +21177,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -20629,15 +21193,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -20972,7 +21532,6 @@ ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask: ; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -20980,9 +21539,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -21123,7 +21683,6 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw 
%edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -21131,9 +21690,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -21458,12 +22018,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi656: +; NoVLX-NEXT: .Lcfi796: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi657: +; NoVLX-NEXT: .Lcfi797: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi658: +; NoVLX-NEXT: .Lcfi798: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -21501,12 +22061,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi659: +; NoVLX-NEXT: .Lcfi799: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi660: +; NoVLX-NEXT: .Lcfi800: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi661: +; NoVLX-NEXT: .Lcfi801: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -21546,15 +22106,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi662: +; NoVLX-NEXT: .Lcfi802: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi663: +; NoVLX-NEXT: .Lcfi803: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi664: +; NoVLX-NEXT: .Lcfi804: ; NoVLX-NEXT: .cfi_def_cfa_register 
%rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -21562,10 +22123,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -21603,12 +22163,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi665: +; NoVLX-NEXT: .Lcfi805: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi666: +; NoVLX-NEXT: .Lcfi806: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi667: +; NoVLX-NEXT: .Lcfi807: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -21661,12 +22221,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi668: +; NoVLX-NEXT: .Lcfi808: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi669: +; NoVLX-NEXT: .Lcfi809: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi670: +; NoVLX-NEXT: .Lcfi810: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -21708,16 +22268,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq 
%rbp -; NoVLX-NEXT: .Lcfi671: +; NoVLX-NEXT: .Lcfi811: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi672: +; NoVLX-NEXT: .Lcfi812: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi673: +; NoVLX-NEXT: .Lcfi813: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -21725,10 +22286,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -21768,20 +22328,20 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi674: +; NoVLX-NEXT: .Lcfi814: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi675: +; NoVLX-NEXT: .Lcfi815: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi676: +; NoVLX-NEXT: .Lcfi816: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -21817,20 +22377,20 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi677: +; NoVLX-NEXT: .Lcfi817: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi678: +; NoVLX-NEXT: .Lcfi818: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi679: +; NoVLX-NEXT: .Lcfi819: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -21868,12 +22428,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi680: +; NoVLX-NEXT: .Lcfi820: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi681: +; NoVLX-NEXT: .Lcfi821: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: 
movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi682: +; NoVLX-NEXT: .Lcfi822: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -21890,8 +22450,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -21931,12 +22491,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi683: +; NoVLX-NEXT: .Lcfi823: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi684: +; NoVLX-NEXT: .Lcfi824: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi685: +; NoVLX-NEXT: .Lcfi825: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -21953,8 +22513,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: 
vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -21995,12 +22555,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi686: +; NoVLX-NEXT: .Lcfi826: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi687: +; NoVLX-NEXT: .Lcfi827: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi688: +; NoVLX-NEXT: .Lcfi828: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -22008,8 +22568,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -22048,12 +22608,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi689: +; NoVLX-NEXT: .Lcfi829: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi690: +; NoVLX-NEXT: .Lcfi830: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi691: +; NoVLX-NEXT: .Lcfi831: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -22071,8 +22631,8 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -22119,8 +22679,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -22176,85 +22736,85 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq 
%zmm1, %zmm1, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <4 x i64> + %load = load <4 x i64>, <4 x i64>* %__b + %1 = bitcast <4 x i64> %load to <4 x i64> + %2 = icmp sgt <4 x i64> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k0 -; NoVLX-NEXT: kshiftlw $7, %k0, %k0 -; NoVLX-NEXT: kshiftrw $7, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kshiftrw $1, %k1, %k1 -; NoVLX-NEXT: kshiftlw $1, %k1, %k1 -; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 -; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 -; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpextrb $12, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 -; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: # kill: %AL %AL %EAX -; NoVLX-NEXT: retq -entry: - %0 = bitcast <4 x i64> %__a to <4 x i64> - %load = load <4 x i64>, <4 x i64>* %__b - %1 = bitcast <4 x i64> %load to <4 x i64> - %2 = icmp sgt <4 x i64> %0, %1 - %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> - %4 = bitcast <8 x i1> %3 to i8 - ret i8 %4 -} - -define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask: -; VLX: # BB#0: # %entry -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vpcmpgtq %ymm1, 
%ymm0, %k0 {%k1} -; VLX-NEXT: kmovd %k0, %eax -; VLX-NEXT: # kill: %AL %AL %EAX -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 -; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 -; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpextrb $4, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -22312,28 +22872,28 @@ ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; 
NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -22395,8 +22955,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -22454,28 +23014,28 @@ ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: 
vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -22945,12 +23505,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi692: +; NoVLX-NEXT: .Lcfi832: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi693: +; NoVLX-NEXT: .Lcfi833: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi694: +; NoVLX-NEXT: .Lcfi834: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -22990,12 +23550,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi695: +; NoVLX-NEXT: .Lcfi835: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi696: +; NoVLX-NEXT: .Lcfi836: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi697: +; NoVLX-NEXT: .Lcfi837: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23037,34 +23597,34 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi698: +; NoVLX-NEXT: .Lcfi838: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi699: +; NoVLX-NEXT: .Lcfi839: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq 
%rsp, %rbp -; NoVLX-NEXT: .Lcfi700: +; NoVLX-NEXT: .Lcfi840: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -23104,34 +23664,34 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi701: +; NoVLX-NEXT: .Lcfi841: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi702: +; NoVLX-NEXT: .Lcfi842: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi703: +; NoVLX-NEXT: .Lcfi843: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, 
%rsp ; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -23172,12 +23732,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi704: +; NoVLX-NEXT: .Lcfi844: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi705: +; NoVLX-NEXT: .Lcfi845: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi706: +; NoVLX-NEXT: .Lcfi846: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23221,35 +23781,35 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: 
.Lcfi707: +; NoVLX-NEXT: .Lcfi847: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi708: +; NoVLX-NEXT: .Lcfi848: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi709: +; NoVLX-NEXT: .Lcfi849: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -23291,12 +23851,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi710: +; NoVLX-NEXT: .Lcfi850: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi711: +; NoVLX-NEXT: .Lcfi851: ; 
NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi712: +; NoVLX-NEXT: .Lcfi852: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -23304,8 +23864,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23342,12 +23902,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi713: +; NoVLX-NEXT: .Lcfi853: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi714: +; NoVLX-NEXT: .Lcfi854: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi715: +; NoVLX-NEXT: .Lcfi855: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -23355,8 +23915,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; 
NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23395,39 +23955,39 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi716: +; NoVLX-NEXT: .Lcfi856: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi717: +; NoVLX-NEXT: .Lcfi857: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi718: +; NoVLX-NEXT: .Lcfi858: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23468,39 +24028,39 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi719: +; NoVLX-NEXT: .Lcfi859: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi720: +; NoVLX-NEXT: .Lcfi860: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi721: +; NoVLX-NEXT: .Lcfi861: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23542,12 +24102,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi722: +; NoVLX-NEXT: .Lcfi862: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi723: +; NoVLX-NEXT: .Lcfi863: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi724: +; NoVLX-NEXT: .Lcfi864: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -23556,8 +24116,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23597,40 +24157,40 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi725: +; NoVLX-NEXT: .Lcfi865: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi726: +; NoVLX-NEXT: .Lcfi866: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; 
NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi727: +; NoVLX-NEXT: .Lcfi867: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -23837,18 +24397,16 @@ ; NoVLX-LABEL: 
test_vpcmpsgtq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi728: +; NoVLX-NEXT: .Lcfi868: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi729: +; NoVLX-NEXT: .Lcfi869: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi730: +; NoVLX-NEXT: .Lcfi870: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -23882,6 +24440,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -23910,18 +24470,16 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi731: +; NoVLX-NEXT: .Lcfi871: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi732: +; NoVLX-NEXT: .Lcfi872: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi733: +; NoVLX-NEXT: .Lcfi873: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -23955,6 +24513,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, 
%zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -23985,19 +24545,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi734: +; NoVLX-NEXT: .Lcfi874: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi735: +; NoVLX-NEXT: .Lcfi875: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi736: +; NoVLX-NEXT: .Lcfi876: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -24031,6 +24589,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24062,19 +24622,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi737: +; NoVLX-NEXT: .Lcfi877: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi738: +; NoVLX-NEXT: .Lcfi878: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi739: +; NoVLX-NEXT: .Lcfi879: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -24108,6 +24666,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb 
$7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24140,18 +24700,16 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi740: +; NoVLX-NEXT: .Lcfi880: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi741: +; NoVLX-NEXT: .Lcfi881: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi742: +; NoVLX-NEXT: .Lcfi882: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -24185,6 +24743,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24216,19 +24776,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi743: +; NoVLX-NEXT: .Lcfi883: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi744: +; NoVLX-NEXT: .Lcfi884: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi745: +; NoVLX-NEXT: .Lcfi885: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; 
NoVLX-NEXT: kmovw %k1, %r8d @@ -24262,6 +24820,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24295,53 +24855,53 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi746: +; NoVLX-NEXT: .Lcfi886: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi747: +; NoVLX-NEXT: .Lcfi887: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi748: +; NoVLX-NEXT: .Lcfi888: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24373,53 +24933,53 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi749: +; NoVLX-NEXT: .Lcfi889: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi750: +; NoVLX-NEXT: .Lcfi890: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi751: +; NoVLX-NEXT: .Lcfi891: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24453,54 +25013,54 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi752: +; NoVLX-NEXT: .Lcfi892: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi753: +; NoVLX-NEXT: .Lcfi893: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi754: +; NoVLX-NEXT: .Lcfi894: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
-; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24535,54 +25095,54 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi755: +; NoVLX-NEXT: .Lcfi895: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi756: +; NoVLX-NEXT: .Lcfi896: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi757: +; NoVLX-NEXT: .Lcfi897: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; 
NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24618,53 +25178,53 @@ ; NoVLX-LABEL: 
test_vpcmpsgtq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi758: +; NoVLX-NEXT: .Lcfi898: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi759: +; NoVLX-NEXT: .Lcfi899: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi760: +; NoVLX-NEXT: .Lcfi900: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24699,54 +25259,54 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi761: +; NoVLX-NEXT: .Lcfi901: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi762: +; NoVLX-NEXT: .Lcfi902: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi763: +; NoVLX-NEXT: .Lcfi903: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -24782,92 +25342,112 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi764: +; NoVLX-NEXT: .Lcfi904: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: 
.Lcfi765: +; NoVLX-NEXT: .Lcfi905: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi766: +; NoVLX-NEXT: .Lcfi906: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi907: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi908: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi909: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi910: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi911: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, 
%r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -24889,15 +25469,30 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi767: +; NoVLX-NEXT: .Lcfi912: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi768: +; NoVLX-NEXT: .Lcfi913: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi769: +; NoVLX-NEXT: .Lcfi914: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi915: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi916: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi917: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi918: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi919: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -24905,77 +25500,82 @@ ; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: 
popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -24999,15 +25599,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi770: +; NoVLX-NEXT: .Lcfi920: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi771: +; NoVLX-NEXT: .Lcfi921: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi772: +; NoVLX-NEXT: .Lcfi922: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi923: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi924: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi925: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi926: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi927: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -25015,77 +25630,82 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, 
%r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; 
NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -25110,15 +25730,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi773: +; NoVLX-NEXT: .Lcfi928: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi774: +; NoVLX-NEXT: .Lcfi929: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi775: +; NoVLX-NEXT: .Lcfi930: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi931: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: 
.Lcfi932: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi933: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi934: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi935: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -25127,77 +25762,82 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, 
%xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; 
NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -25223,12 +25863,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi776: +; NoVLX-NEXT: .Lcfi936: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi777: +; NoVLX-NEXT: .Lcfi937: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi778: +; NoVLX-NEXT: .Lcfi938: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25237,15 +25877,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi779: +; NoVLX-NEXT: .Lcfi939: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi780: +; NoVLX-NEXT: .Lcfi940: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi781: +; NoVLX-NEXT: .Lcfi941: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi782: +; NoVLX-NEXT: .Lcfi942: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi783: +; NoVLX-NEXT: .Lcfi943: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -25253,6 +25893,10 @@ ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, 
%r8d @@ -25295,11 +25939,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -25311,15 +25955,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -25355,12 +25995,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi784: +; NoVLX-NEXT: .Lcfi944: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi785: +; NoVLX-NEXT: .Lcfi945: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi786: +; NoVLX-NEXT: .Lcfi946: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25369,15 +26009,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi787: +; NoVLX-NEXT: .Lcfi947: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi788: +; NoVLX-NEXT: .Lcfi948: ; NoVLX-NEXT: 
.cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi789: +; NoVLX-NEXT: .Lcfi949: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi790: +; NoVLX-NEXT: .Lcfi950: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi791: +; NoVLX-NEXT: .Lcfi951: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 @@ -25386,6 +26026,10 @@ ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -25428,11 +26072,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -25444,15 +26088,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -25490,12 +26130,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi792: +; NoVLX-NEXT: .Lcfi952: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi793: +; NoVLX-NEXT: .Lcfi953: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi794: +; NoVLX-NEXT: .Lcfi954: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25504,15 +26144,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi795: +; NoVLX-NEXT: .Lcfi955: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi796: +; NoVLX-NEXT: .Lcfi956: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi797: +; NoVLX-NEXT: .Lcfi957: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi798: +; NoVLX-NEXT: .Lcfi958: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi799: +; NoVLX-NEXT: .Lcfi959: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -25521,6 +26161,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -25563,11 +26207,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, 
%r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -25579,15 +26223,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -25626,12 +26266,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi800: +; NoVLX-NEXT: .Lcfi960: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi801: +; NoVLX-NEXT: .Lcfi961: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi802: +; NoVLX-NEXT: .Lcfi962: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25640,15 +26280,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi803: +; NoVLX-NEXT: .Lcfi963: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi804: +; NoVLX-NEXT: .Lcfi964: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi805: +; NoVLX-NEXT: .Lcfi965: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi806: +; NoVLX-NEXT: .Lcfi966: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi807: +; NoVLX-NEXT: .Lcfi967: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 @@ -25658,6 +26298,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw 
%edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -25700,11 +26344,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -25716,15 +26360,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -25765,12 +26405,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi808: +; NoVLX-NEXT: .Lcfi968: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi809: +; NoVLX-NEXT: .Lcfi969: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi810: +; NoVLX-NEXT: .Lcfi970: ; NoVLX-NEXT: .cfi_def_cfa_register 
%rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -25816,12 +26456,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi811: +; NoVLX-NEXT: .Lcfi971: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi812: +; NoVLX-NEXT: .Lcfi972: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi813: +; NoVLX-NEXT: .Lcfi973: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -25870,12 +26510,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi814: +; NoVLX-NEXT: .Lcfi974: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi815: +; NoVLX-NEXT: .Lcfi975: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi816: +; NoVLX-NEXT: .Lcfi976: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -25933,12 +26573,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi817: +; NoVLX-NEXT: .Lcfi977: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi818: +; NoVLX-NEXT: .Lcfi978: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi819: +; NoVLX-NEXT: .Lcfi979: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -26130,12 +26770,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi820: +; NoVLX-NEXT: .Lcfi980: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi821: +; NoVLX-NEXT: .Lcfi981: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi822: +; NoVLX-NEXT: .Lcfi982: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ 
-26145,8 +26785,6 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -26180,6 +26818,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -26207,12 +26847,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi823: +; NoVLX-NEXT: .Lcfi983: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi824: +; NoVLX-NEXT: .Lcfi984: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi825: +; NoVLX-NEXT: .Lcfi985: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -26223,8 +26863,6 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -26258,6 +26896,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -26287,12 +26927,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi826: +; NoVLX-NEXT: .Lcfi986: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi827: +; NoVLX-NEXT: .Lcfi987: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi828: +; NoVLX-NEXT: .Lcfi988: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -26303,8 +26943,6 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -26338,6 +26976,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -26368,12 +27008,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi829: +; NoVLX-NEXT: .Lcfi989: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi830: +; NoVLX-NEXT: .Lcfi990: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi831: +; NoVLX-NEXT: .Lcfi991: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -26385,8 +27025,6 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -26420,6 +27058,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: 
vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -26451,12 +27091,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi832: +; NoVLX-NEXT: .Lcfi992: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi833: +; NoVLX-NEXT: .Lcfi993: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi834: +; NoVLX-NEXT: .Lcfi994: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -26466,43 +27106,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, 
%xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -26533,12 +27173,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi835: +; NoVLX-NEXT: .Lcfi995: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi836: +; NoVLX-NEXT: .Lcfi996: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi837: +; NoVLX-NEXT: .Lcfi997: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -26549,43 +27189,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d 
; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, 
%zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -26618,12 +27258,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi838: +; NoVLX-NEXT: .Lcfi998: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi839: +; NoVLX-NEXT: .Lcfi999: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi840: +; NoVLX-NEXT: .Lcfi1000: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -26634,43 +27274,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -26704,12 +27344,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi841: +; NoVLX-NEXT: .Lcfi1001: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi842: +; NoVLX-NEXT: .Lcfi1002: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi843: +; NoVLX-NEXT: .Lcfi1003: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -26721,43 +27361,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, 
%k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd 
%zmm0, %zmm0, %k0 @@ -26793,92 +27433,112 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi844: +; NoVLX-NEXT: .Lcfi1004: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi845: +; NoVLX-NEXT: .Lcfi1005: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi846: +; NoVLX-NEXT: .Lcfi1006: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1007: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1008: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1009: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1010: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1011: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -26901,15 +27561,30 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi847: +; NoVLX-NEXT: .Lcfi1012: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi848: +; NoVLX-NEXT: .Lcfi1013: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi849: +; NoVLX-NEXT: .Lcfi1014: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1015: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1016: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1017: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; 
NoVLX-NEXT: .Lcfi1018: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1019: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -26917,77 +27592,82 @@ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, 
%esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -27012,15 +27692,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi850: +; NoVLX-NEXT: .Lcfi1020: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi851: +; NoVLX-NEXT: .Lcfi1021: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi852: +; NoVLX-NEXT: .Lcfi1022: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1023: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1024: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1025: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1026: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1027: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 @@ -27028,77 +27723,82 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -27124,15 +27824,30 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi853: +; NoVLX-NEXT: .Lcfi1028: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi854: +; NoVLX-NEXT: .Lcfi1029: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi855: +; NoVLX-NEXT: .Lcfi1030: ; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1031: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1032: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1033: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1034: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1035: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -27141,77 +27856,82 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: 
kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -27238,12 +27958,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi856: +; NoVLX-NEXT: .Lcfi1036: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi857: +; NoVLX-NEXT: .Lcfi1037: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi858: +; NoVLX-NEXT: .Lcfi1038: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27252,15 +27972,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi859: +; NoVLX-NEXT: .Lcfi1039: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi860: +; NoVLX-NEXT: .Lcfi1040: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi861: +; NoVLX-NEXT: .Lcfi1041: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi862: +; NoVLX-NEXT: .Lcfi1042: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi863: +; NoVLX-NEXT: .Lcfi1043: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -27268,6 +27988,10 @@ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -27310,11 +28034,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -27326,15 +28050,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -27371,12 +28091,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi864: +; NoVLX-NEXT: .Lcfi1044: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi865: +; NoVLX-NEXT: .Lcfi1045: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi866: +; NoVLX-NEXT: .Lcfi1046: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; 
NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27385,15 +28105,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi867: +; NoVLX-NEXT: .Lcfi1047: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi868: +; NoVLX-NEXT: .Lcfi1048: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi869: +; NoVLX-NEXT: .Lcfi1049: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi870: +; NoVLX-NEXT: .Lcfi1050: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi871: +; NoVLX-NEXT: .Lcfi1051: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 @@ -27402,6 +28122,10 @@ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -27444,11 +28168,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -27460,15 +28184,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, 
%k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -27507,12 +28227,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi872: +; NoVLX-NEXT: .Lcfi1052: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi873: +; NoVLX-NEXT: .Lcfi1053: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi874: +; NoVLX-NEXT: .Lcfi1054: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27521,15 +28241,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi875: +; NoVLX-NEXT: .Lcfi1055: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi876: +; NoVLX-NEXT: .Lcfi1056: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi877: +; NoVLX-NEXT: .Lcfi1057: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi878: +; NoVLX-NEXT: .Lcfi1058: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi879: +; NoVLX-NEXT: .Lcfi1059: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -27538,6 +28258,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -27580,11 +28304,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; 
NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -27596,15 +28320,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -27644,12 +28364,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi880: +; NoVLX-NEXT: .Lcfi1060: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi881: +; NoVLX-NEXT: .Lcfi1061: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi882: +; NoVLX-NEXT: .Lcfi1062: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27658,15 +28378,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi883: +; NoVLX-NEXT: .Lcfi1063: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi884: +; NoVLX-NEXT: .Lcfi1064: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi885: +; NoVLX-NEXT: .Lcfi1065: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: 
.Lcfi886: +; NoVLX-NEXT: .Lcfi1066: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi887: +; NoVLX-NEXT: .Lcfi1067: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 @@ -27676,6 +28396,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -27718,11 +28442,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -27734,15 +28458,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -27783,12 +28503,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: ; NoVLX: # BB#0: # 
%entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi888: +; NoVLX-NEXT: .Lcfi1068: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi889: +; NoVLX-NEXT: .Lcfi1069: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi890: +; NoVLX-NEXT: .Lcfi1070: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -27933,110 +28653,108 @@ ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm1, %rax -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vmovd %eax, %xmm7 -; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm7, %xmm7 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm7, %xmm7 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rcx -; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm3 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm0 +; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $3, %eax, %xmm7, %xmm4 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm4, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm7 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm3 
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm0 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm2 -; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, 
%xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftrw 
$15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -28103,6 +28821,8 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -28137,12 +28857,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi891: +; NoVLX-NEXT: .Lcfi1071: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi892: +; NoVLX-NEXT: .Lcfi1072: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi893: +; NoVLX-NEXT: .Lcfi1073: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -28227,9 +28947,9 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm0 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 @@ -28305,8 +29025,6 @@ ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: 
vptestmd %zmm2, %zmm2, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -28374,6 +29092,8 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -28410,12 +29130,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi894: +; NoVLX-NEXT: .Lcfi1074: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi895: +; NoVLX-NEXT: .Lcfi1075: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi896: +; NoVLX-NEXT: .Lcfi1076: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -28442,58 +29162,58 @@ ; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, 
%xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; 
NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 @@ -28501,31 +29221,31 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm5 +; NoVLX-NEXT: vmovq %xmm5, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm6 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6 +; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm7 @@ -28533,20 +29253,20 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, 
%eax, %xmm7, %xmm7 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm6 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6 ; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm7 ; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm2 @@ -28565,32 +29285,36 @@ ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 ; NoVLX-NEXT: vmovq %xmm1, %rax ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vmovd %eax, %xmm7 -; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm7, %xmm7 +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm7, %xmm7 +; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rcx -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 +; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm8 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $3, %eax, %xmm7, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm3 +; NoVLX-NEXT: vpinsrw 
$3, %eax, %xmm2, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm2 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm2, %ymm2 ; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 ; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2 @@ -28657,83 +29381,79 @@ ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 +; NoVLX-NEXT: vpcmpgtw %ymm8, %ymm3, %ymm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1 -; NoVLX-NEXT: vpxor %ymm4, %ymm1, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 
; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 @@ -28776,12 +29496,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi897: +; NoVLX-NEXT: .Lcfi1077: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi898: +; NoVLX-NEXT: .Lcfi1078: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi899: +; NoVLX-NEXT: .Lcfi1079: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -28870,153 +29590,153 @@ ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm1, 
%ymm2, %ymm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; NoVLX-NEXT: vmovdqa (%rsi), %ymm3 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 +; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm2 ; NoVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpxor %ymm3, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, 
%eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm4 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm5 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; 
NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2 -; NoVLX-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpxor %ymm3, %ymm5, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2 ; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -29064,8 +29784,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; 
NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -29122,8 +29842,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -29177,28 +29897,28 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; 
NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -29255,170 +29975,170 @@ ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: 
vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %4 = and <4 x i1> %2, %extract.i + %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> + %6 = bitcast <8 x i1> %5 to i8 + ret i8 %6 +} + + +define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { +; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; 
NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL %AL %EAX +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <4 x i32> + %load = load i32, i32* %__b + %vec = insertelement <4 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> + %2 = icmp sge <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + ret i8 %4 +} + +define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled 
%xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL %AL %EAX +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpextrb $0, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k0 -; NoVLX-NEXT: kshiftlw $7, %k0, %k0 -; NoVLX-NEXT: kshiftrw $7, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kshiftrw $1, %k1, %k1 -; NoVLX-NEXT: kshiftlw $1, %k1, %k1 -; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 -; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 -; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpextrb $12, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 -; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: # kill: %AL %AL %EAX -; NoVLX-NEXT: retq -entry: - %0 = bitcast <2 x i64> %__a to <4 x i32> - %load = load <2 x i64>, <2 x i64>* %__b - %1 = bitcast <2 x i64> %load to <4 x i32> - %2 = icmp sge <4 x i32> %0, %1 - %3 = bitcast i8 %__u to <8 x i1> - %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> - %4 = and <4 x i1> %2, %extract.i - %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> - %6 = bitcast <8 x i1> %5 to i8 - ret i8 %6 -} - - -define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: -; VLX: # BB#0: # %entry -; VLX-NEXT: vpbroadcastd (%rdi), %xmm1 -; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; VLX-NEXT: kmovd %k0, %eax -; VLX-NEXT: # kill: %AL %AL %EAX -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpextrb $4, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpextrb $0, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k0 -; NoVLX-NEXT: kshiftlw $7, %k0, %k0 -; NoVLX-NEXT: kshiftrw $7, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kshiftrw $1, %k1, %k1 
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1 -; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 -; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpextrb $8, %xmm0, %eax -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 -; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpextrb $12, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 -; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 -; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: # kill: %AL %AL %EAX -; NoVLX-NEXT: retq -entry: - %0 = bitcast <2 x i64> %__a to <4 x i32> - %load = load i32, i32* %__b - %vec = insertelement <4 x i32> undef, i32 %load, i32 0 - %1 = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> - %2 = icmp sge <4 x i32> %0, %1 - %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> - %4 = bitcast <8 x i1> %3 to i8 - ret i8 %4 -} - -define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: -; VLX: # BB#0: # %entry -; VLX-NEXT: vpbroadcastd (%rsi), %xmm1 -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; VLX-NEXT: kmovd %k0, %eax -; VLX-NEXT: # kill: %AL %AL %EAX -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: -; NoVLX: 
# BB#0: # %entry -; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 -; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 -; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 -; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpextrb $4, %xmm0, %eax -; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -29885,12 +30605,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi900: +; NoVLX-NEXT: .Lcfi1080: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi901: +; NoVLX-NEXT: .Lcfi1081: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi902: +; NoVLX-NEXT: .Lcfi1082: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -29930,12 +30650,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi903: +; NoVLX-NEXT: .Lcfi1083: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi904: +; NoVLX-NEXT: .Lcfi1084: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi905: +; NoVLX-NEXT: .Lcfi1085: ; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -29978,34 +30698,34 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi906: +; NoVLX-NEXT: .Lcfi1086: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi907: +; NoVLX-NEXT: .Lcfi1087: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi908: +; NoVLX-NEXT: .Lcfi1088: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -30043,35 +30763,35 @@ 
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi909: +; NoVLX-NEXT: .Lcfi1089: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi910: +; NoVLX-NEXT: .Lcfi1090: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi911: +; NoVLX-NEXT: .Lcfi1091: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -30111,12 +30831,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # 
%entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi912: +; NoVLX-NEXT: .Lcfi1092: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi913: +; NoVLX-NEXT: .Lcfi1093: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi914: +; NoVLX-NEXT: .Lcfi1094: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -30161,35 +30881,35 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi915: +; NoVLX-NEXT: .Lcfi1095: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi916: +; NoVLX-NEXT: .Lcfi1096: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi917: +; NoVLX-NEXT: .Lcfi1097: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, 
%xmm2, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -30229,12 +30949,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi918: +; NoVLX-NEXT: .Lcfi1098: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi919: +; NoVLX-NEXT: .Lcfi1099: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi920: +; NoVLX-NEXT: .Lcfi1100: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -30243,8 +30963,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30280,12 +31000,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi921: +; NoVLX-NEXT: .Lcfi1101: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi922: +; NoVLX-NEXT: .Lcfi1102: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi923: +; NoVLX-NEXT: .Lcfi1103: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -30295,8 
+31015,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30334,38 +31054,38 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi924: +; NoVLX-NEXT: .Lcfi1104: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi925: +; NoVLX-NEXT: .Lcfi1105: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi926: +; NoVLX-NEXT: .Lcfi1106: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: 
kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30405,39 +31125,39 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi927: +; NoVLX-NEXT: .Lcfi1107: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi928: +; NoVLX-NEXT: .Lcfi1108: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi929: +; NoVLX-NEXT: .Lcfi1109: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: 
kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30479,12 +31199,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi930: +; NoVLX-NEXT: .Lcfi1110: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi931: +; NoVLX-NEXT: .Lcfi1111: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi932: +; NoVLX-NEXT: .Lcfi1112: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -30494,8 +31214,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; 
NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30535,39 +31255,39 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi933: +; NoVLX-NEXT: .Lcfi1113: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi934: +; NoVLX-NEXT: .Lcfi1114: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi935: +; NoVLX-NEXT: .Lcfi1115: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, 
%xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -30800,20 +31520,18 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi936: +; NoVLX-NEXT: .Lcfi1116: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi937: +; NoVLX-NEXT: .Lcfi1117: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi938: +; NoVLX-NEXT: .Lcfi1118: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -30847,6 +31565,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, 
%zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -30875,20 +31595,18 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi939: +; NoVLX-NEXT: .Lcfi1119: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi940: +; NoVLX-NEXT: .Lcfi1120: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi941: +; NoVLX-NEXT: .Lcfi1121: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -30922,6 +31640,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -30952,12 +31672,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi942: +; NoVLX-NEXT: .Lcfi1122: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi943: +; NoVLX-NEXT: .Lcfi1123: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi944: +; NoVLX-NEXT: .Lcfi1124: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -30966,8 +31686,6 @@ ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -31001,6 +31719,8 @@ ; 
NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -31032,12 +31752,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi945: +; NoVLX-NEXT: .Lcfi1125: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi946: +; NoVLX-NEXT: .Lcfi1126: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi947: +; NoVLX-NEXT: .Lcfi1127: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -31046,8 +31766,6 @@ ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -31081,6 +31799,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -31114,20 +31834,183 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi948: +; NoVLX-NEXT: .Lcfi1128: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi949: +; NoVLX-NEXT: .Lcfi1129: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi950: +; NoVLX-NEXT: .Lcfi1130: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: 
vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sge <8 x i32> %0, %1 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> + %4 = bitcast <32 x 
i1> %3 to i32 + ret i32 %4 +} + +define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1131: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1132: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1133: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <4 x i64> %__a to <8 x i32> + %load = load i32, i32* %__b + %vec = insertelement <8 x i32> undef, i32 %load, i32 0 + %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %2 = icmp sge <8 x i32> %0, %1 + %3 = bitcast i8 %__u to <8 x i1> + %4 = and <8 x i1> %3, %2 + %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { +; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1134: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1135: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1136: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 +; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -31155,9 +32038,9 @@ ; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 @@ -31165,50 +32048,49 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax ; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> - %load = load i32, i32* %__b - %vec = insertelement <8 x i32> undef, i32 %load, i32 0 - %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> + %1 = bitcast <4 x i64> %__b to <8 x i32> %2 = icmp sge <8 x i32> %0, %1 - %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> - %4 = bitcast <32 x i1> %3 to i32 - ret i32 %4 + %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 } -define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: +define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { +; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: ; VLX: # BB#0: # %entry -; VLX-NEXT: vpbroadcastd (%rsi), %ymm1 -; VLX-NEXT: kmovd %edi, %k1 -; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 +; VLX-NEXT: 
kmovq %k0, %rax ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq ; -; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: +; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi951: +; NoVLX-NEXT: .Lcfi1137: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi952: +; NoVLX-NEXT: .Lcfi1138: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi953: +; NoVLX-NEXT: .Lcfi1139: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: kandw %k0, %k1, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -31246,168 +32128,6 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) -; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <4 x i64> %__a to <8 x i32> - %load = load i32, i32* %__b - %vec = insertelement <8 x i32> undef, i32 %load, i32 0 - %1 = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> - %2 = icmp sge <8 x i32> %0, %1 - %3 = bitcast i8 %__u to <8 x i1> - %4 = and <8 x i1> %3, %2 - %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> - %6 = bitcast <32 x i1> %5 to i32 - ret i32 %6 -} - - -define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask: -; VLX: # BB#0: # %entry -; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; VLX-NEXT: 
kmovq %k0, %rax -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi954: -; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi955: -; NoVLX-NEXT: .cfi_offset %rbp, -16 -; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi956: -; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 -; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, (%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: movq %rbp, %rsp -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <4 x i64> %__a to <8 x i32> - %1 = bitcast <4 x i64> %__b to <8 x i32> - %2 = icmp sge <8 x i32> %0, %1 - %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <64 x i32> - %4 = bitcast <64 x i1> %3 to i64 - ret i64 %4 -} - -define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: -; VLX: # BB#0: # %entry -; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 -; VLX-NEXT: kmovq %k0, %rax -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi957: -; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi958: -; NoVLX-NEXT: .cfi_offset %rbp, -16 -; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi959: -; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 -; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: 
kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax @@ -31437,12 +32157,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi960: +; NoVLX-NEXT: .Lcfi1140: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi961: +; NoVLX-NEXT: .Lcfi1141: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi962: +; NoVLX-NEXT: .Lcfi1142: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -31451,43 +32171,43 @@ ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -31522,12 +32242,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi963: +; NoVLX-NEXT: .Lcfi1143: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi964: +; NoVLX-NEXT: .Lcfi1144: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi965: +; NoVLX-NEXT: .Lcfi1145: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -31536,43 +32256,43 @@ ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -31609,55 +32329,55 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi966: +; NoVLX-NEXT: .Lcfi1146: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi967: +; NoVLX-NEXT: .Lcfi1147: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi968: +; NoVLX-NEXT: .Lcfi1148: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, 
%k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: 
vptestmd %zmm0, %zmm0, %k0 @@ -31693,12 +32413,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi969: +; NoVLX-NEXT: .Lcfi1149: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi970: +; NoVLX-NEXT: .Lcfi1150: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi971: +; NoVLX-NEXT: .Lcfi1151: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -31707,43 +32427,43 @@ ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw 
%k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -31780,87 +32500,107 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi972: +; NoVLX-NEXT: .Lcfi1152: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi973: +; NoVLX-NEXT: .Lcfi1153: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi974: +; NoVLX-NEXT: .Lcfi1154: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1155: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1156: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1157: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1158: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1159: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; NoVLX-NEXT: kxorw %k0, 
%k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; 
NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -31883,87 +32623,107 
@@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi975: +; NoVLX-NEXT: .Lcfi1160: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi976: +; NoVLX-NEXT: .Lcfi1161: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi977: +; NoVLX-NEXT: .Lcfi1162: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1163: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1164: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1165: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1166: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1167: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -31988,88 +32748,108 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi978: +; NoVLX-NEXT: .Lcfi1168: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi979: +; NoVLX-NEXT: .Lcfi1169: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi980: +; NoVLX-NEXT: .Lcfi1170: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1171: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1172: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1173: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1174: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1175: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, 
%k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: 
vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -32095,88 +32875,108 @@ ; 
NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi981: +; NoVLX-NEXT: .Lcfi1176: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi982: +; NoVLX-NEXT: .Lcfi1177: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi983: +; NoVLX-NEXT: .Lcfi1178: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1179: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1180: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1181: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1182: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1183: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; 
NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -32204,88 +33004,108 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi984: +; NoVLX-NEXT: .Lcfi1184: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi985: +; NoVLX-NEXT: .Lcfi1185: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi986: +; NoVLX-NEXT: .Lcfi1186: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1187: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1188: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1189: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1190: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1191: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 
-; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw 
$4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ 
-32312,89 +33132,109 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi987: +; NoVLX-NEXT: .Lcfi1192: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi988: +; NoVLX-NEXT: .Lcfi1193: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi989: +; NoVLX-NEXT: .Lcfi1194: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1195: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1196: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1197: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1198: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1199: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: 
vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, 
%r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -32422,12 +33262,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi990: +; NoVLX-NEXT: .Lcfi1200: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi991: +; NoVLX-NEXT: .Lcfi1201: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi992: +; NoVLX-NEXT: .Lcfi1202: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32436,17 +33276,149 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi993: +; NoVLX-NEXT: .Lcfi1203: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi994: +; NoVLX-NEXT: .Lcfi1204: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi995: +; NoVLX-NEXT: .Lcfi1205: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi996: +; NoVLX-NEXT: .Lcfi1206: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; 
NoVLX-NEXT: .Lcfi997: +; NoVLX-NEXT: .Lcfi1207: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, 
%r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <8 x i64> %__a to <16 x i32> + %1 = bitcast <8 x i64> %__b to <16 x i32> + %2 = icmp sge <16 x i32> %0, %1 + %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> + %4 = bitcast <64 x i1> %3 to i64 + ret i64 %4 +} + +define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { +; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1208: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1209: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: 
movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1210: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1211: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1212: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1213: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1214: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1215: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -32489,139 +33461,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, 
%r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, (%rsp) -; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; NoVLX-NEXT: shlq $32, %rcx -; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__a to <16 x i32> - %1 = bitcast <8 x i64> %__b to <16 x i32> - %2 = icmp sge <16 x i32> %0, %1 - %3 = shufflevector <16 x i1> %2, <16 x i1> zeroinitializer, <64 x i32> - %4 = bitcast <64 x i1> %3 to i64 - ret i64 %4 -} - -define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: -; VLX: # BB#0: # %entry -; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 -; VLX-NEXT: kmovq %k0, %rax -; VLX-NEXT: vzeroupper -; VLX-NEXT: retq -; -; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi998: -; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi999: -; NoVLX-NEXT: .cfi_offset %rbp, -16 -; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1000: -; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx -; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1001: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1002: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1003: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1004: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1005: -; NoVLX-NEXT: .cfi_offset %r15, -24 
-; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r10d -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d -; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d -; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d -; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi -; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx -; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi -; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx -; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -32633,15 +33477,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, 
%xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -32680,12 +33520,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1006: +; NoVLX-NEXT: .Lcfi1216: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1007: +; NoVLX-NEXT: .Lcfi1217: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1008: +; NoVLX-NEXT: .Lcfi1218: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32694,18 +33534,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1009: +; NoVLX-NEXT: .Lcfi1219: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1010: +; NoVLX-NEXT: .Lcfi1220: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1011: +; NoVLX-NEXT: .Lcfi1221: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1012: +; NoVLX-NEXT: .Lcfi1222: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1013: +; NoVLX-NEXT: .Lcfi1223: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; 
NoVLX-NEXT: kmovw %k1, %r8d @@ -32748,11 +33592,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -32764,15 +33608,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -32812,12 +33652,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1014: +; NoVLX-NEXT: .Lcfi1224: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1015: +; NoVLX-NEXT: .Lcfi1225: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1016: +; NoVLX-NEXT: .Lcfi1226: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32826,18 +33666,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1017: +; NoVLX-NEXT: .Lcfi1227: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1018: +; 
NoVLX-NEXT: .Lcfi1228: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1019: +; NoVLX-NEXT: .Lcfi1229: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1020: +; NoVLX-NEXT: .Lcfi1230: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1021: +; NoVLX-NEXT: .Lcfi1231: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -32880,11 +33724,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -32896,15 +33740,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -32946,12 +33786,12 @@ ; NoVLX-LABEL: 
test_vpcmpsged_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1022: +; NoVLX-NEXT: .Lcfi1232: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1023: +; NoVLX-NEXT: .Lcfi1233: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1024: +; NoVLX-NEXT: .Lcfi1234: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32960,18 +33800,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1025: +; NoVLX-NEXT: .Lcfi1235: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1026: +; NoVLX-NEXT: .Lcfi1236: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1027: +; NoVLX-NEXT: .Lcfi1237: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1028: +; NoVLX-NEXT: .Lcfi1238: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1029: +; NoVLX-NEXT: .Lcfi1239: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -33014,11 +33858,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -33030,15 +33874,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, 
%edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -33079,12 +33919,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1030: +; NoVLX-NEXT: .Lcfi1240: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1031: +; NoVLX-NEXT: .Lcfi1241: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1032: +; NoVLX-NEXT: .Lcfi1242: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -33093,19 +33933,23 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1033: +; NoVLX-NEXT: .Lcfi1243: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1034: +; NoVLX-NEXT: .Lcfi1244: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1035: +; NoVLX-NEXT: .Lcfi1245: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1036: +; NoVLX-NEXT: .Lcfi1246: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1037: +; NoVLX-NEXT: .Lcfi1247: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; 
NoVLX-NEXT: kmovw %k1, %r8d @@ -33148,11 +33992,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -33164,15 +34008,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -33522,7 +34362,6 @@ ; ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask: ; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -33530,9 +34369,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: 
kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -33576,7 +34416,6 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -33584,9 +34423,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -33678,7 +34518,6 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -33686,9 +34525,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -34023,12 +34863,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1038: +; NoVLX-NEXT: .Lcfi1248: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1039: +; NoVLX-NEXT: 
.Lcfi1249: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1040: +; NoVLX-NEXT: .Lcfi1250: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -34068,12 +34908,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1041: +; NoVLX-NEXT: .Lcfi1251: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1042: +; NoVLX-NEXT: .Lcfi1252: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1043: +; NoVLX-NEXT: .Lcfi1253: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -34116,15 +34956,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1044: +; NoVLX-NEXT: .Lcfi1254: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1045: +; NoVLX-NEXT: .Lcfi1255: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1046: +; NoVLX-NEXT: .Lcfi1256: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -34132,10 +34973,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, 
%xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -34173,16 +35013,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1047: +; NoVLX-NEXT: .Lcfi1257: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1048: +; NoVLX-NEXT: .Lcfi1258: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1049: +; NoVLX-NEXT: .Lcfi1259: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -34190,10 +35031,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -34233,12 +35073,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1050: +; NoVLX-NEXT: .Lcfi1260: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1051: +; NoVLX-NEXT: .Lcfi1261: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1052: +; NoVLX-NEXT: .Lcfi1262: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -34283,16 +35123,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: 
# %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1053: +; NoVLX-NEXT: .Lcfi1263: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1054: +; NoVLX-NEXT: .Lcfi1264: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1055: +; NoVLX-NEXT: .Lcfi1265: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -34300,10 +35141,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -34343,12 +35183,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1056: +; NoVLX-NEXT: .Lcfi1266: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1057: +; NoVLX-NEXT: .Lcfi1267: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1058: +; NoVLX-NEXT: .Lcfi1268: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34357,8 +35197,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34394,12 +35234,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1059: +; NoVLX-NEXT: .Lcfi1269: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1060: +; NoVLX-NEXT: .Lcfi1270: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1061: +; NoVLX-NEXT: .Lcfi1271: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34409,8 +35249,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34448,12 +35288,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1062: +; NoVLX-NEXT: .Lcfi1272: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1063: +; NoVLX-NEXT: .Lcfi1273: ; NoVLX-NEXT: 
.cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1064: +; NoVLX-NEXT: .Lcfi1274: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34470,8 +35310,8 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34511,12 +35351,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1065: +; NoVLX-NEXT: .Lcfi1275: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1066: +; NoVLX-NEXT: .Lcfi1276: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1067: +; NoVLX-NEXT: .Lcfi1277: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34534,8 +35374,8 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: 
vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34577,12 +35417,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1068: +; NoVLX-NEXT: .Lcfi1278: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1069: +; NoVLX-NEXT: .Lcfi1279: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1070: +; NoVLX-NEXT: .Lcfi1280: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34592,8 +35432,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34633,12 +35473,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1071: +; NoVLX-NEXT: .Lcfi1281: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1072: +; NoVLX-NEXT: .Lcfi1282: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1073: +; NoVLX-NEXT: .Lcfi1283: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -34656,8 +35496,8 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -34706,8 +35546,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -34766,8 +35606,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -34825,28 +35665,28 @@ ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: 
vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -34907,28 +35747,28 @@ ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, 
%xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -34993,8 +35833,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -35055,28 +35895,28 @@ ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; 
NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -35562,12 +36402,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1074: +; NoVLX-NEXT: .Lcfi1284: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1075: +; NoVLX-NEXT: .Lcfi1285: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1076: +; NoVLX-NEXT: .Lcfi1286: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35609,12 +36449,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1077: +; NoVLX-NEXT: .Lcfi1287: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1078: +; NoVLX-NEXT: .Lcfi1288: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1079: +; NoVLX-NEXT: .Lcfi1289: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35659,12 +36499,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1080: +; NoVLX-NEXT: .Lcfi1290: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1081: +; NoVLX-NEXT: .Lcfi1291: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1082: +; NoVLX-NEXT: .Lcfi1292: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35672,23 +36512,23 @@ ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw 
$13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -35728,12 +36568,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1083: +; NoVLX-NEXT: .Lcfi1293: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1084: +; NoVLX-NEXT: .Lcfi1294: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1085: +; NoVLX-NEXT: .Lcfi1295: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35742,23 +36582,23 @@ ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, 
%k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -35800,12 +36640,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1086: +; NoVLX-NEXT: .Lcfi1296: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1087: +; NoVLX-NEXT: .Lcfi1297: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1088: +; NoVLX-NEXT: .Lcfi1298: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35852,12 +36692,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1089: +; NoVLX-NEXT: .Lcfi1299: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1090: +; NoVLX-NEXT: .Lcfi1300: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1091: +; NoVLX-NEXT: .Lcfi1301: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: 
andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35866,23 +36706,23 @@ ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -35924,12 +36764,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1092: +; NoVLX-NEXT: .Lcfi1302: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1093: +; NoVLX-NEXT: .Lcfi1303: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1094: +; NoVLX-NEXT: .Lcfi1304: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -35939,8 +36779,8 @@ ; NoVLX-NEXT: vpmovqd 
%zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -35977,12 +36817,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1095: +; NoVLX-NEXT: .Lcfi1305: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1096: +; NoVLX-NEXT: .Lcfi1306: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1097: +; NoVLX-NEXT: .Lcfi1307: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -35993,8 +36833,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -36033,41 +36873,41 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1098: +; NoVLX-NEXT: .Lcfi1308: ; 
NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1099: +; NoVLX-NEXT: .Lcfi1309: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1100: +; NoVLX-NEXT: .Lcfi1310: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; 
NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -36108,12 +36948,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1101: +; NoVLX-NEXT: .Lcfi1311: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1102: +; NoVLX-NEXT: .Lcfi1312: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1103: +; NoVLX-NEXT: .Lcfi1313: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -36121,29 +36961,29 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -36186,12 +37026,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1104: +; NoVLX-NEXT: .Lcfi1314: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1105: +; NoVLX-NEXT: .Lcfi1315: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1106: +; NoVLX-NEXT: .Lcfi1316: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -36202,8 +37042,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -36244,12 +37084,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1107: +; NoVLX-NEXT: .Lcfi1317: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1108: +; NoVLX-NEXT: .Lcfi1318: ; NoVLX-NEXT: .cfi_offset %rbp, -16 
; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1109: +; NoVLX-NEXT: .Lcfi1319: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -36257,29 +37097,29 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld 
$31, %zmm1, %zmm1 @@ -36490,18 +37330,16 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1110: +; NoVLX-NEXT: .Lcfi1320: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1111: +; NoVLX-NEXT: .Lcfi1321: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1112: +; NoVLX-NEXT: .Lcfi1322: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -36535,6 +37373,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -36563,18 +37403,16 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1113: +; NoVLX-NEXT: .Lcfi1323: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1114: +; NoVLX-NEXT: .Lcfi1324: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1115: +; NoVLX-NEXT: .Lcfi1325: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -36608,6 +37446,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -36638,19 +37478,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1116: +; NoVLX-NEXT: .Lcfi1326: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1117: +; NoVLX-NEXT: .Lcfi1327: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1118: +; NoVLX-NEXT: .Lcfi1328: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -36684,6 +37522,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -36715,19 +37555,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1119: +; NoVLX-NEXT: .Lcfi1329: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1120: +; NoVLX-NEXT: .Lcfi1330: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1121: +; NoVLX-NEXT: .Lcfi1331: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -36761,6 +37599,8 @@ ; NoVLX-NEXT: 
vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -36794,19 +37634,17 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1122: +; NoVLX-NEXT: .Lcfi1332: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1123: +; NoVLX-NEXT: .Lcfi1333: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1124: +; NoVLX-NEXT: .Lcfi1334: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1 ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -36840,6 +37678,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -36872,20 +37712,18 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1125: +; NoVLX-NEXT: .Lcfi1335: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1126: +; NoVLX-NEXT: .Lcfi1336: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1127: +; NoVLX-NEXT: .Lcfi1337: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, 
%k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -36919,6 +37757,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -36952,53 +37792,53 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1128: +; NoVLX-NEXT: .Lcfi1338: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1129: +; NoVLX-NEXT: .Lcfi1339: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1130: +; NoVLX-NEXT: .Lcfi1340: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37030,53 +37870,53 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1131: +; NoVLX-NEXT: .Lcfi1341: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1132: +; NoVLX-NEXT: .Lcfi1342: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1133: +; NoVLX-NEXT: .Lcfi1343: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw 
%k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37110,54 +37950,54 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1134: +; NoVLX-NEXT: .Lcfi1344: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1135: +; NoVLX-NEXT: .Lcfi1345: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1136: +; NoVLX-NEXT: .Lcfi1346: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37192,54 +38032,54 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1137: +; NoVLX-NEXT: .Lcfi1347: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1138: +; NoVLX-NEXT: .Lcfi1348: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1139: +; NoVLX-NEXT: .Lcfi1349: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: 
vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37276,54 +38116,54 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1140: +; NoVLX-NEXT: .Lcfi1350: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1141: +; NoVLX-NEXT: .Lcfi1351: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1142: +; NoVLX-NEXT: .Lcfi1352: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1 ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, 
%xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37359,55 +38199,55 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1143: +; NoVLX-NEXT: .Lcfi1353: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1144: +; NoVLX-NEXT: .Lcfi1354: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1145: +; NoVLX-NEXT: .Lcfi1355: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: 
kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -37443,15 +38283,30 @@ ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1146: +; NoVLX-NEXT: .Lcfi1356: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1147: +; NoVLX-NEXT: .Lcfi1357: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1148: +; NoVLX-NEXT: .Lcfi1358: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1359: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1360: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1361: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1362: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1363: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 @@ -37459,77 +38314,82 @@ ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; 
NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: 
kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -37551,15 +38411,30 @@ ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1149: +; NoVLX-NEXT: .Lcfi1364: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1150: +; NoVLX-NEXT: .Lcfi1365: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1151: +; NoVLX-NEXT: .Lcfi1366: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1367: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1368: +; 
NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1369: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1370: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1371: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 @@ -37567,77 +38442,82 @@ ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: 
kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; 
NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -37661,15 +38541,30 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1152: +; NoVLX-NEXT: .Lcfi1372: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1153: +; NoVLX-NEXT: .Lcfi1373: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1154: +; NoVLX-NEXT: .Lcfi1374: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1375: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1376: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1377: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1378: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1379: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 @@ -37678,77 +38573,82 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb 
$10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -37773,125 +38673,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1155: -; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1156: -; NoVLX-NEXT: .cfi_offset %rbp, -16 -; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1157: -; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: 
kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kmovw %k0, (%rsp) -; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp -; NoVLX-NEXT: popq %rbp -; NoVLX-NEXT: retq -entry: - %0 = bitcast <2 x i64> %__a to <16 x i8> - %load = load <2 x i64>, <2 x i64>* %__b - %1 = bitcast <2 x i64> %load to <16 x i8> - %2 = icmp ult <16 x i8> %0, %1 - %3 = bitcast i16 %__u to <16 x i1> - %4 = and <16 x i1> %2, %3 - %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> - %6 = bitcast <32 x i1> %5 to i32 - ret i32 %6 -} - - -define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: -; VLX: # BB#0: # %entry -; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 -; VLX-NEXT: kmovq %k0, %rax -; VLX-NEXT: 
retq -; -; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: -; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1158: +; NoVLX-NEXT: .Lcfi1380: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1159: +; NoVLX-NEXT: .Lcfi1381: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1160: +; NoVLX-NEXT: .Lcfi1382: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -37899,24 +38686,25 @@ ; NoVLX-NEXT: pushq %r12 ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1161: +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1383: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1162: +; NoVLX-NEXT: .Lcfi1384: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1163: +; NoVLX-NEXT: .Lcfi1385: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1164: +; NoVLX-NEXT: .Lcfi1386: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1165: +; NoVLX-NEXT: .Lcfi1387: ; NoVLX-NEXT: .cfi_offset %r15, -24 -; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -37959,11 +38747,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: 
kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -37975,15 +38763,147 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq +entry: + %0 = bitcast <2 x i64> %__a to <16 x i8> + %load = load <2 x i64>, <2 x i64>* %__b + %1 = bitcast <2 x i64> %load to <16 x i8> + %2 = icmp ult <16 x i8> %0, %1 + %3 = bitcast i16 %__u to <16 x i1> + %4 = and <16 x i1> %2, %3 + %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> + %6 = bitcast <32 x i1> %5 to i32 + ret i32 %6 +} + + +define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { +; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; 
NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1388: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1389: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1390: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1391: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1392: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1393: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1394: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1395: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -38019,12 +38939,12 @@ ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1166: 
+; NoVLX-NEXT: .Lcfi1396: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1167: +; NoVLX-NEXT: .Lcfi1397: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1168: +; NoVLX-NEXT: .Lcfi1398: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -38033,23 +38953,27 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1169: +; NoVLX-NEXT: .Lcfi1399: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1170: +; NoVLX-NEXT: .Lcfi1400: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1171: +; NoVLX-NEXT: .Lcfi1401: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1172: +; NoVLX-NEXT: .Lcfi1402: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1173: +; NoVLX-NEXT: .Lcfi1403: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -38092,11 +39016,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; 
NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -38108,15 +39032,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -38154,12 +39074,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1174: +; NoVLX-NEXT: .Lcfi1404: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1175: +; NoVLX-NEXT: .Lcfi1405: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1176: +; NoVLX-NEXT: .Lcfi1406: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -38168,15 +39088,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1177: +; NoVLX-NEXT: .Lcfi1407: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1178: +; NoVLX-NEXT: .Lcfi1408: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1179: +; NoVLX-NEXT: .Lcfi1409: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1180: +; NoVLX-NEXT: .Lcfi1410: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1181: +; NoVLX-NEXT: .Lcfi1411: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm2, 
%xmm0, %xmm0 @@ -38186,6 +39106,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -38228,11 +39152,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -38244,15 +39168,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -38291,12 +39211,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1182: +; NoVLX-NEXT: .Lcfi1412: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1183: +; NoVLX-NEXT: .Lcfi1413: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; 
NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1184: +; NoVLX-NEXT: .Lcfi1414: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -38305,24 +39225,28 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1185: +; NoVLX-NEXT: .Lcfi1415: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1186: +; NoVLX-NEXT: .Lcfi1416: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1187: +; NoVLX-NEXT: .Lcfi1417: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1188: +; NoVLX-NEXT: .Lcfi1418: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1189: +; NoVLX-NEXT: .Lcfi1419: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -38365,11 +39289,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -38381,15 +39305,11 @@ ; 
NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -38430,12 +39350,12 @@ ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1190: +; NoVLX-NEXT: .Lcfi1420: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1191: +; NoVLX-NEXT: .Lcfi1421: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1192: +; NoVLX-NEXT: .Lcfi1422: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -38482,12 +39402,12 @@ ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1193: +; NoVLX-NEXT: .Lcfi1423: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1194: +; NoVLX-NEXT: .Lcfi1424: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1195: +; NoVLX-NEXT: .Lcfi1425: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -38536,12 +39456,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1196: +; NoVLX-NEXT: .Lcfi1426: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1197: +; NoVLX-NEXT: .Lcfi1427: ; NoVLX-NEXT: .cfi_offset %rbp, -16 
; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1198: +; NoVLX-NEXT: .Lcfi1428: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -38600,12 +39520,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1199: +; NoVLX-NEXT: .Lcfi1429: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1200: +; NoVLX-NEXT: .Lcfi1430: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1201: +; NoVLX-NEXT: .Lcfi1431: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -38799,12 +39719,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1202: +; NoVLX-NEXT: .Lcfi1432: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1203: +; NoVLX-NEXT: .Lcfi1433: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1204: +; NoVLX-NEXT: .Lcfi1434: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -38815,8 +39735,6 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -38850,6 +39768,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -38877,12 +39797,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1205: +; NoVLX-NEXT: 
.Lcfi1435: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1206: +; NoVLX-NEXT: .Lcfi1436: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1207: +; NoVLX-NEXT: .Lcfi1437: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -38893,8 +39813,6 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -38928,6 +39846,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -38957,12 +39877,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1208: +; NoVLX-NEXT: .Lcfi1438: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1209: +; NoVLX-NEXT: .Lcfi1439: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1210: +; NoVLX-NEXT: .Lcfi1440: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -38974,8 +39894,6 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -39009,6 +39927,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -39039,12 +39959,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1211: +; NoVLX-NEXT: .Lcfi1441: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1212: +; NoVLX-NEXT: .Lcfi1442: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1213: +; NoVLX-NEXT: .Lcfi1443: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -39056,8 +39976,6 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -39091,6 +40009,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -39122,12 +40042,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1214: +; NoVLX-NEXT: .Lcfi1444: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1215: +; NoVLX-NEXT: .Lcfi1445: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1216: +; NoVLX-NEXT: .Lcfi1446: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -39138,43 +40058,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; 
NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: 
vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -39205,12 +40125,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1217: +; NoVLX-NEXT: .Lcfi1447: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1218: +; NoVLX-NEXT: .Lcfi1448: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1219: +; NoVLX-NEXT: .Lcfi1449: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -39221,43 +40141,43 @@ ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, 
%xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -39290,12 +40210,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1220: +; NoVLX-NEXT: .Lcfi1450: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1221: +; NoVLX-NEXT: .Lcfi1451: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1222: +; NoVLX-NEXT: .Lcfi1452: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -39307,43 +40227,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -39377,12 +40297,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1223: +; NoVLX-NEXT: .Lcfi1453: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1224: +; NoVLX-NEXT: .Lcfi1454: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1225: +; NoVLX-NEXT: .Lcfi1455: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -39394,43 +40314,43 @@ ; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -39466,15 +40386,30 @@ ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1226: +; NoVLX-NEXT: .Lcfi1456: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1227: +; NoVLX-NEXT: .Lcfi1457: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1228: +; NoVLX-NEXT: .Lcfi1458: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1459: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1460: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: 
.Lcfi1461: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1462: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1463: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 @@ -39482,77 +40417,82 @@ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, 
%k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb 
$15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -39575,15 +40515,30 @@ ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1229: +; NoVLX-NEXT: .Lcfi1464: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1230: +; NoVLX-NEXT: .Lcfi1465: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1231: +; NoVLX-NEXT: .Lcfi1466: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1467: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1468: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1469: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1470: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1471: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 @@ -39591,77 +40546,82 @@ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, 
%eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: 
vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -39686,15 +40646,30 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1232: +; NoVLX-NEXT: .Lcfi1472: ; 
NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1233: +; NoVLX-NEXT: .Lcfi1473: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1234: +; NoVLX-NEXT: .Lcfi1474: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1475: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1476: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1477: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1478: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1479: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 @@ -39703,77 +40678,82 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; 
NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, 
%r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -39799,15 +40779,30 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1235: +; NoVLX-NEXT: .Lcfi1480: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1236: +; NoVLX-NEXT: .Lcfi1481: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1237: +; NoVLX-NEXT: .Lcfi1482: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1483: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1484: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1485: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1486: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: 
.Lcfi1487: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 @@ -39816,77 +40811,82 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; 
NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: 
vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -39913,12 +40913,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1238: +; NoVLX-NEXT: .Lcfi1488: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1239: +; NoVLX-NEXT: .Lcfi1489: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1240: +; NoVLX-NEXT: .Lcfi1490: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -39927,15 +40927,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1241: +; NoVLX-NEXT: .Lcfi1491: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1242: +; NoVLX-NEXT: .Lcfi1492: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1243: +; NoVLX-NEXT: .Lcfi1493: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1244: +; NoVLX-NEXT: .Lcfi1494: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1245: +; NoVLX-NEXT: .Lcfi1495: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -39944,6 +40944,10 @@ ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; 
NoVLX-NEXT: kmovw %k1, %r8d @@ -39986,11 +40990,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -40002,15 +41006,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -40047,12 +41047,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1246: +; NoVLX-NEXT: .Lcfi1496: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1247: +; NoVLX-NEXT: .Lcfi1497: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1248: +; NoVLX-NEXT: .Lcfi1498: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -40061,23 +41061,27 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1249: +; NoVLX-NEXT: .Lcfi1499: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1250: +; 
NoVLX-NEXT: .Lcfi1500: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1251: +; NoVLX-NEXT: .Lcfi1501: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1252: +; NoVLX-NEXT: .Lcfi1502: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1253: +; NoVLX-NEXT: .Lcfi1503: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm2 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -40120,11 +41124,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -40136,15 +41140,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw 
%k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -40183,12 +41183,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1254: +; NoVLX-NEXT: .Lcfi1504: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1255: +; NoVLX-NEXT: .Lcfi1505: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1256: +; NoVLX-NEXT: .Lcfi1506: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -40197,15 +41197,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1257: +; NoVLX-NEXT: .Lcfi1507: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1258: +; NoVLX-NEXT: .Lcfi1508: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1259: +; NoVLX-NEXT: .Lcfi1509: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1260: +; NoVLX-NEXT: .Lcfi1510: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1261: +; NoVLX-NEXT: .Lcfi1511: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -40215,6 +41215,10 @@ ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -40257,11 +41261,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -40273,15 +41277,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -40321,12 +41321,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1262: +; NoVLX-NEXT: .Lcfi1512: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1263: +; NoVLX-NEXT: .Lcfi1513: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1264: +; NoVLX-NEXT: .Lcfi1514: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -40335,24 +41335,28 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1265: +; NoVLX-NEXT: .Lcfi1515: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1266: +; NoVLX-NEXT: .Lcfi1516: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1267: +; 
NoVLX-NEXT: .Lcfi1517: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1268: +; NoVLX-NEXT: .Lcfi1518: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1269: +; NoVLX-NEXT: .Lcfi1519: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -40395,11 +41399,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -40411,15 +41415,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -40460,12 +41460,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1270: +; NoVLX-NEXT: .Lcfi1520: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1271: +; NoVLX-NEXT: .Lcfi1521: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1272: +; NoVLX-NEXT: .Lcfi1522: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -40490,7 +41490,7 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm3 @@ -40511,7 +41511,7 @@ ; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 ; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm9 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm5 @@ -40569,78 +41569,80 @@ ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rcx, %rax +; 
NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vmovd %eax, %xmm6 -; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 ; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $3, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm8 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm7 -; NoVLX-NEXT: vmovq %xmm7, %rax +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rax ; NoVLX-NEXT: movl %eax, %ecx ; 
NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vmovd %eax, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rcx +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm2 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm1 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm2 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm1, %ymm4, %ymm3 -; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm4 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm3 = 
[32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vpxor %ymm3, %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -40704,18 +41706,14 @@ ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 ; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpxor %ymm1, %ymm3, %ymm2 -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm3, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -40782,6 +41780,8 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -40816,12 +41816,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1273: +; NoVLX-NEXT: .Lcfi1523: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1274: +; NoVLX-NEXT: .Lcfi1524: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1275: +; NoVLX-NEXT: .Lcfi1525: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; 
NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -40832,161 +41832,161 @@ ; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rdx +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 -; NoVLX-NEXT: movl %edx, %ecx +; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %edx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: movq %rdx, %rcx -; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $7, %edx, %xmm1, %xmm1 ; NoVLX-NEXT: vmovq %xmm0, %rcx -; NoVLX-NEXT: movl %ecx, %edx -; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rcx, %rdx -; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: movl %eax, %edx -; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3 ; 
NoVLX-NEXT: vmovq %xmm3, %rcx -; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $6, %edx, %xmm2, %xmm2 -; NoVLX-NEXT: movl %ecx, %edx -; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 -; NoVLX-NEXT: movq %rcx, %rdx -; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rdx +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 -; NoVLX-NEXT: movl %edx, %ecx +; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rdx, %rcx +; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 ; NoVLX-NEXT: vmovq %xmm0, %rcx -; NoVLX-NEXT: shrq $48, %rdx -; NoVLX-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3 -; NoVLX-NEXT: movl %ecx, %edx -; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 -; NoVLX-NEXT: movq %rcx, %rdx -; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm0, %rdx +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: 
vpinsrw $3, %ecx, %xmm4, %xmm0 -; NoVLX-NEXT: movl %edx, %ecx +; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: movq %rdx, %rcx +; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: shrq $48, %rdx -; NoVLX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm0, %ymm3, %ymm3 -; NoVLX-NEXT: vpxor 32(%rdi), %ymm0, %ymm4 -; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 -; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 -; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor 32(%rdi), %ymm1, %ymm3 +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; 
NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpxor %ymm0, %ymm1, %ymm1 -; NoVLX-NEXT: vpxor (%rdi), %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -41089,12 +42089,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1276: +; NoVLX-NEXT: .Lcfi1526: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1277: +; NoVLX-NEXT: .Lcfi1527: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1278: +; NoVLX-NEXT: .Lcfi1528: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -41118,61 +42118,61 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; NoVLX-NEXT: vmovq 
%xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm5 +; NoVLX-NEXT: vmovq %xmm5, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm9 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; 
NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 @@ -41180,8 +42180,8 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm5 -; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax @@ -41191,159 +42191,92 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6 -; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; 
NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6 -; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm7 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 -; NoVLX-NEXT: vpextrq $1, %xmm6, %rax -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm6 +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm7 +; NoVLX-NEXT: vmovq %xmm7, %rax ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vmovd %eax, %xmm6 +; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm6, %xmm6 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6 -; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm7 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $3, %eax, %xmm6, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq 
$1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: vpinsrw $3, %eax, %xmm3, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm2 +; NoVLX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = 
[32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm2 +; NoVLX-NEXT: vpxor %ymm5, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm3 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm4 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm3 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm5 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm4, %ymm1, %ymm1 -; NoVLX-NEXT: vpxor %ymm4, %ymm5, %ymm5 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm5, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 -; NoVLX-NEXT: kshiftlw $14, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: kshiftlw $15, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw 
%k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2 -; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -41402,24 +42335,91 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw 
%k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm2 +; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4 +; NoVLX-NEXT: vpxor %ymm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm2, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -41457,12 +42457,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1279: +; 
NoVLX-NEXT: .Lcfi1529: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1280: +; NoVLX-NEXT: .Lcfi1530: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1281: +; NoVLX-NEXT: .Lcfi1531: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -41486,223 +42486,223 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; 
NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm6 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; 
NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm1 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; NoVLX-NEXT: vpxor (%rsi), %ymm3, %ymm4 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm4, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2 +; NoVLX-NEXT: vpxor (%rsi), %ymm4, %ymm5 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm5, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: 
kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb 
$14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpxor 32(%rsi), %ymm3, %ymm4 -; NoVLX-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; NoVLX-NEXT: vpxor 32(%rsi), %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: 
vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm3, 
%xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -41746,8 +42746,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -41799,13 +42799,13 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -41862,28 +42862,28 @@ ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -41942,28 +42942,28 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; 
NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -42026,8 +43026,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -42086,28 +43086,28 @@ ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; 
NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -42582,12 +43582,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1282: +; NoVLX-NEXT: .Lcfi1532: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1283: +; NoVLX-NEXT: .Lcfi1533: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1284: +; NoVLX-NEXT: .Lcfi1534: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -42628,19 +43628,19 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1285: +; NoVLX-NEXT: .Lcfi1535: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1286: +; NoVLX-NEXT: .Lcfi1536: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1287: +; NoVLX-NEXT: .Lcfi1537: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; 
NoVLX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -42676,12 +43676,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1288: +; NoVLX-NEXT: .Lcfi1538: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1289: +; NoVLX-NEXT: .Lcfi1539: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1290: +; NoVLX-NEXT: .Lcfi1540: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -42689,24 +43689,24 @@ ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, 
%k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -42744,12 +43744,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1291: +; NoVLX-NEXT: .Lcfi1541: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1292: +; NoVLX-NEXT: .Lcfi1542: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1293: +; NoVLX-NEXT: .Lcfi1543: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -42757,24 +43757,24 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, 
%k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -42813,12 +43813,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1294: +; NoVLX-NEXT: .Lcfi1544: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1295: +; NoVLX-NEXT: .Lcfi1545: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1296: +; NoVLX-NEXT: .Lcfi1546: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -42863,12 +43863,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1297: +; NoVLX-NEXT: .Lcfi1547: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1298: +; NoVLX-NEXT: .Lcfi1548: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1299: +; NoVLX-NEXT: .Lcfi1549: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -42877,24 +43877,24 @@ ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; 
NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm2 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -42934,12 +43934,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1300: +; NoVLX-NEXT: .Lcfi1550: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1301: +; NoVLX-NEXT: .Lcfi1551: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1302: +; NoVLX-NEXT: .Lcfi1552: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -42949,8 +43949,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; 
NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -42986,12 +43986,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1303: +; NoVLX-NEXT: .Lcfi1553: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1304: +; NoVLX-NEXT: .Lcfi1554: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1305: +; NoVLX-NEXT: .Lcfi1555: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -43001,8 +44001,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -43040,41 +44040,41 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1306: +; NoVLX-NEXT: .Lcfi1556: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1307: +; NoVLX-NEXT: .Lcfi1557: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1308: +; NoVLX-NEXT: .Lcfi1558: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; 
NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -43114,41 +44114,41 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1309: +; NoVLX-NEXT: .Lcfi1559: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: 
.Lcfi1310: +; NoVLX-NEXT: .Lcfi1560: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1311: +; NoVLX-NEXT: .Lcfi1561: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -43189,12 +44189,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1312: +; NoVLX-NEXT: .Lcfi1562: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1313: +; NoVLX-NEXT: .Lcfi1563: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1314: +; NoVLX-NEXT: .Lcfi1564: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -43205,8 +44205,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -43245,12 +44245,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1315: +; NoVLX-NEXT: .Lcfi1565: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1316: +; NoVLX-NEXT: .Lcfi1566: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1317: +; NoVLX-NEXT: .Lcfi1567: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -43258,29 +44258,29 @@ ; NoVLX-NEXT: 
vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ 
-43511,20 +44511,18 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1318: +; NoVLX-NEXT: .Lcfi1568: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1319: +; NoVLX-NEXT: .Lcfi1569: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1320: +; NoVLX-NEXT: .Lcfi1570: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -43558,6 +44556,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -43586,20 +44586,18 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1321: +; NoVLX-NEXT: .Lcfi1571: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1322: +; NoVLX-NEXT: .Lcfi1572: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1323: +; NoVLX-NEXT: .Lcfi1573: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -43633,6 +44631,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw 
%k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -43663,12 +44663,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1324: +; NoVLX-NEXT: .Lcfi1574: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1325: +; NoVLX-NEXT: .Lcfi1575: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1326: +; NoVLX-NEXT: .Lcfi1576: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43677,8 +44677,6 @@ ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -43712,6 +44710,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -43743,12 +44743,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1327: +; NoVLX-NEXT: .Lcfi1577: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1328: +; NoVLX-NEXT: .Lcfi1578: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1329: +; NoVLX-NEXT: .Lcfi1579: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43757,8 +44757,6 @@ ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: 
kandw %k1, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -43792,6 +44790,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -43824,20 +44824,18 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1330: +; NoVLX-NEXT: .Lcfi1580: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1331: +; NoVLX-NEXT: .Lcfi1581: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1332: +; NoVLX-NEXT: .Lcfi1582: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -43871,6 +44869,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -43902,12 +44902,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1333: +; NoVLX-NEXT: .Lcfi1583: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1334: +; NoVLX-NEXT: .Lcfi1584: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; 
NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1335: +; NoVLX-NEXT: .Lcfi1585: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43916,8 +44916,6 @@ ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k0, %k1, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -43951,6 +44949,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -43984,55 +44984,55 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1336: +; NoVLX-NEXT: .Lcfi1586: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1337: +; NoVLX-NEXT: .Lcfi1587: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1338: +; NoVLX-NEXT: .Lcfi1588: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44064,55 +45064,55 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1339: +; NoVLX-NEXT: .Lcfi1589: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1340: +; NoVLX-NEXT: .Lcfi1590: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1341: +; NoVLX-NEXT: .Lcfi1591: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44146,12 +45146,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1342: +; NoVLX-NEXT: .Lcfi1592: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1343: +; NoVLX-NEXT: .Lcfi1593: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1344: +; NoVLX-NEXT: .Lcfi1594: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -44160,43 +45160,43 @@ ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: 
kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44231,12 +45231,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1345: +; NoVLX-NEXT: .Lcfi1595: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1346: +; 
NoVLX-NEXT: .Lcfi1596: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1347: +; NoVLX-NEXT: .Lcfi1597: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -44245,43 +45245,43 @@ ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44317,55 +45317,55 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1348: +; NoVLX-NEXT: .Lcfi1598: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1349: +; NoVLX-NEXT: .Lcfi1599: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1350: +; NoVLX-NEXT: .Lcfi1600: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; 
NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44400,12 +45400,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1351: +; NoVLX-NEXT: .Lcfi1601: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1352: +; NoVLX-NEXT: .Lcfi1602: ; NoVLX-NEXT: .cfi_offset %rbp, 
-16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1353: +; NoVLX-NEXT: .Lcfi1603: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -44414,43 +45414,43 @@ ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw 
$8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -44487,87 +45487,107 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1354: +; NoVLX-NEXT: .Lcfi1604: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1355: +; NoVLX-NEXT: .Lcfi1605: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1356: +; NoVLX-NEXT: .Lcfi1606: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1607: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1608: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1609: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1610: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1611: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; 
NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; 
NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -44590,87 +45610,107 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1357: +; NoVLX-NEXT: .Lcfi1612: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1358: +; NoVLX-NEXT: .Lcfi1613: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: 
.Lcfi1359: +; NoVLX-NEXT: .Lcfi1614: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1615: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1616: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1617: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1618: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1619: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; 
NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; 
NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -44695,88 +45735,108 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1360: +; NoVLX-NEXT: .Lcfi1620: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1361: +; NoVLX-NEXT: .Lcfi1621: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1362: +; NoVLX-NEXT: .Lcfi1622: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1623: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1624: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1625: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1626: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1627: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; 
NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; 
NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -44802,88 +45862,108 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1363: +; NoVLX-NEXT: .Lcfi1628: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1364: +; NoVLX-NEXT: .Lcfi1629: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; 
NoVLX-NEXT: .Lcfi1365: +; NoVLX-NEXT: .Lcfi1630: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1631: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1632: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1633: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1634: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1635: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, 
%xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -44910,87 +45990,107 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1366: +; NoVLX-NEXT: .Lcfi1636: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1367: +; NoVLX-NEXT: .Lcfi1637: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1368: +; NoVLX-NEXT: .Lcfi1638: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1639: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1640: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1641: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1642: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1643: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, 
%r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -45016,88 +46116,108 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1369: +; NoVLX-NEXT: .Lcfi1644: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1370: +; NoVLX-NEXT: .Lcfi1645: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; 
NoVLX-NEXT: .Lcfi1371: +; NoVLX-NEXT: .Lcfi1646: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1647: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1648: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1649: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1650: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1651: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, 
%xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 
+; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -45125,12 +46245,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1372: +; NoVLX-NEXT: .Lcfi1652: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1373: +; NoVLX-NEXT: .Lcfi1653: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1374: +; NoVLX-NEXT: .Lcfi1654: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45139,17 +46259,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1375: +; NoVLX-NEXT: .Lcfi1655: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1376: +; NoVLX-NEXT: .Lcfi1656: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1377: +; NoVLX-NEXT: .Lcfi1657: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1378: +; NoVLX-NEXT: .Lcfi1658: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1379: +; NoVLX-NEXT: .Lcfi1659: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw 
%k1, %r8d @@ -45192,11 +46316,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45208,15 +46332,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -45253,12 +46373,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1380: +; NoVLX-NEXT: .Lcfi1660: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1381: +; NoVLX-NEXT: .Lcfi1661: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1382: +; NoVLX-NEXT: .Lcfi1662: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45267,17 +46387,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1383: +; NoVLX-NEXT: .Lcfi1663: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1384: +; NoVLX-NEXT: .Lcfi1664: ; 
NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1385: +; NoVLX-NEXT: .Lcfi1665: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1386: +; NoVLX-NEXT: .Lcfi1666: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1387: +; NoVLX-NEXT: .Lcfi1667: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -45320,11 +46444,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45336,15 +46460,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -45383,12 +46503,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp 
-; NoVLX-NEXT: .Lcfi1388: +; NoVLX-NEXT: .Lcfi1668: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1389: +; NoVLX-NEXT: .Lcfi1669: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1390: +; NoVLX-NEXT: .Lcfi1670: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45397,18 +46517,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1391: +; NoVLX-NEXT: .Lcfi1671: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1392: +; NoVLX-NEXT: .Lcfi1672: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1393: +; NoVLX-NEXT: .Lcfi1673: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1394: +; NoVLX-NEXT: .Lcfi1674: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1395: +; NoVLX-NEXT: .Lcfi1675: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -45451,11 +46575,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45467,15 +46591,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -45515,12 +46635,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1396: +; NoVLX-NEXT: .Lcfi1676: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1397: +; NoVLX-NEXT: .Lcfi1677: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1398: +; NoVLX-NEXT: .Lcfi1678: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45529,18 +46649,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1399: +; NoVLX-NEXT: .Lcfi1679: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1400: +; NoVLX-NEXT: .Lcfi1680: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1401: +; NoVLX-NEXT: .Lcfi1681: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1402: +; NoVLX-NEXT: .Lcfi1682: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1403: +; NoVLX-NEXT: .Lcfi1683: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -45583,11 +46707,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; 
NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45599,15 +46723,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -45648,12 +46768,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1404: +; NoVLX-NEXT: .Lcfi1684: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1405: +; NoVLX-NEXT: .Lcfi1685: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1406: +; NoVLX-NEXT: .Lcfi1686: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45662,17 +46782,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1407: +; NoVLX-NEXT: .Lcfi1687: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1408: +; NoVLX-NEXT: .Lcfi1688: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1409: +; NoVLX-NEXT: .Lcfi1689: ; NoVLX-NEXT: 
.cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1410: +; NoVLX-NEXT: .Lcfi1690: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1411: +; NoVLX-NEXT: .Lcfi1691: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -45715,11 +46839,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45731,15 +46855,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -45779,12 +46899,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1412: +; NoVLX-NEXT: .Lcfi1692: ; NoVLX-NEXT: .cfi_def_cfa_offset 
16 -; NoVLX-NEXT: .Lcfi1413: +; NoVLX-NEXT: .Lcfi1693: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1414: +; NoVLX-NEXT: .Lcfi1694: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45793,18 +46913,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1415: +; NoVLX-NEXT: .Lcfi1695: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1416: +; NoVLX-NEXT: .Lcfi1696: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1417: +; NoVLX-NEXT: .Lcfi1697: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1418: +; NoVLX-NEXT: .Lcfi1698: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1419: +; NoVLX-NEXT: .Lcfi1699: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -45847,11 +46971,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -45863,15 +46987,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -46187,9 +47307,9 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -46233,7 +47353,6 @@ ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -46241,9 +47360,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -46287,9 +47407,8 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = 
[9223372036854775808,9223372036854775808] -; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -46297,9 +47416,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -46393,7 +47513,6 @@ ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -46401,9 +47520,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -46746,12 +47866,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1420: +; NoVLX-NEXT: .Lcfi1700: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1421: +; NoVLX-NEXT: .Lcfi1701: ; NoVLX-NEXT: 
.cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1422: +; NoVLX-NEXT: .Lcfi1702: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -46792,19 +47912,19 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1423: +; NoVLX-NEXT: .Lcfi1703: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1424: +; NoVLX-NEXT: .Lcfi1704: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1425: +; NoVLX-NEXT: .Lcfi1705: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -46840,18 +47960,19 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1426: +; NoVLX-NEXT: .Lcfi1706: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1427: +; NoVLX-NEXT: .Lcfi1707: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1428: +; NoVLX-NEXT: .Lcfi1708: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: 
kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -46859,10 +47980,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -46900,18 +48020,19 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1429: +; NoVLX-NEXT: .Lcfi1709: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1430: +; NoVLX-NEXT: .Lcfi1710: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1431: +; NoVLX-NEXT: .Lcfi1711: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -46919,10 +48040,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -46961,12 +48081,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1432: +; NoVLX-NEXT: .Lcfi1712: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1433: +; NoVLX-NEXT: .Lcfi1713: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1434: +; NoVLX-NEXT: .Lcfi1714: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -47011,12 +48131,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1435: +; NoVLX-NEXT: .Lcfi1715: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1436: +; NoVLX-NEXT: .Lcfi1716: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1437: +; NoVLX-NEXT: .Lcfi1717: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -47024,6 +48144,7 @@ ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -47031,10 +48152,9 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 
= ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -47074,12 +48194,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1438: +; NoVLX-NEXT: .Lcfi1718: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1439: +; NoVLX-NEXT: .Lcfi1719: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1440: +; NoVLX-NEXT: .Lcfi1720: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47089,8 +48209,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47126,12 +48246,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1441: +; NoVLX-NEXT: .Lcfi1721: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1442: +; NoVLX-NEXT: .Lcfi1722: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1443: +; NoVLX-NEXT: .Lcfi1723: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47141,8 +48261,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: 
kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47180,12 +48300,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1444: +; NoVLX-NEXT: .Lcfi1724: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1445: +; NoVLX-NEXT: .Lcfi1725: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1446: +; NoVLX-NEXT: .Lcfi1726: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47205,8 +48325,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47246,12 +48366,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1447: +; NoVLX-NEXT: .Lcfi1727: 
; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1448: +; NoVLX-NEXT: .Lcfi1728: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1449: +; NoVLX-NEXT: .Lcfi1729: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47271,8 +48391,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47313,12 +48433,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1450: +; NoVLX-NEXT: .Lcfi1730: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1451: +; NoVLX-NEXT: .Lcfi1731: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1452: +; NoVLX-NEXT: .Lcfi1732: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47329,8 +48449,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47369,12 +48489,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1453: +; NoVLX-NEXT: .Lcfi1733: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1454: +; NoVLX-NEXT: .Lcfi1734: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1455: +; NoVLX-NEXT: .Lcfi1735: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -47395,8 +48515,8 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -47446,8 +48566,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -47506,8 +48626,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; 
NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -47566,28 +48686,28 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -47644,32 +48764,32 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -47734,8 +48854,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; 
NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -47796,28 +48916,28 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -48305,12 +49425,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1456: +; NoVLX-NEXT: .Lcfi1736: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1457: +; NoVLX-NEXT: .Lcfi1737: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1458: +; NoVLX-NEXT: .Lcfi1738: 
; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48353,12 +49473,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1459: +; NoVLX-NEXT: .Lcfi1739: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1460: +; NoVLX-NEXT: .Lcfi1740: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1461: +; NoVLX-NEXT: .Lcfi1741: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48403,12 +49523,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1462: +; NoVLX-NEXT: .Lcfi1742: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1463: +; NoVLX-NEXT: .Lcfi1743: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1464: +; NoVLX-NEXT: .Lcfi1744: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48417,23 +49537,23 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; 
NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -48473,37 +49593,37 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1465: +; NoVLX-NEXT: .Lcfi1745: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1466: +; NoVLX-NEXT: .Lcfi1746: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1467: +; NoVLX-NEXT: .Lcfi1747: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; 
NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -48544,12 +49664,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1468: +; NoVLX-NEXT: .Lcfi1748: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1469: +; NoVLX-NEXT: .Lcfi1749: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1470: +; NoVLX-NEXT: .Lcfi1750: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48596,12 +49716,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1471: +; NoVLX-NEXT: .Lcfi1751: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1472: +; NoVLX-NEXT: .Lcfi1752: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1473: +; NoVLX-NEXT: .Lcfi1753: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48611,23 +49731,23 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw 
$15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -48669,12 +49789,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1474: +; NoVLX-NEXT: .Lcfi1754: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1475: +; NoVLX-NEXT: .Lcfi1755: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1476: +; NoVLX-NEXT: .Lcfi1756: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48685,8 +49805,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -48723,12 +49843,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1477: +; NoVLX-NEXT: .Lcfi1757: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1478: +; NoVLX-NEXT: .Lcfi1758: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1479: +; NoVLX-NEXT: .Lcfi1759: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48739,8 +49859,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -48779,12 +49899,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1480: +; NoVLX-NEXT: .Lcfi1760: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1481: +; NoVLX-NEXT: .Lcfi1761: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1482: +; NoVLX-NEXT: .Lcfi1762: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48792,29 +49912,29 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm0, 
%ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -48855,12 +49975,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1483: +; NoVLX-NEXT: .Lcfi1763: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; 
NoVLX-NEXT: .Lcfi1484: +; NoVLX-NEXT: .Lcfi1764: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1485: +; NoVLX-NEXT: .Lcfi1765: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48868,29 +49988,29 @@ ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: 
vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -48932,12 +50052,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1486: +; NoVLX-NEXT: .Lcfi1766: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1487: +; NoVLX-NEXT: .Lcfi1767: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1488: +; NoVLX-NEXT: .Lcfi1768: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48949,8 +50069,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -48990,12 +50110,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1489: +; NoVLX-NEXT: .Lcfi1769: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1490: +; NoVLX-NEXT: .Lcfi1770: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1491: +; NoVLX-NEXT: .Lcfi1771: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -49004,29 +50124,29 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 -; 
NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $15, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 -; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k3 ; NoVLX-NEXT: kshiftrw $15, %k3, %k3 -; NoVLX-NEXT: kshiftlw $14, %k0, %k0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: kmovw %k3, %eax +; NoVLX-NEXT: kmovw %k2, %ecx ; NoVLX-NEXT: vmovd %ecx, %xmm1 ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -49233,18 +50353,16 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1492: +; NoVLX-NEXT: .Lcfi1772: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1493: +; NoVLX-NEXT: .Lcfi1773: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1494: +; NoVLX-NEXT: .Lcfi1774: ; 
NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -49278,6 +50396,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49306,18 +50426,16 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1495: +; NoVLX-NEXT: .Lcfi1775: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1496: +; NoVLX-NEXT: .Lcfi1776: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1497: +; NoVLX-NEXT: .Lcfi1777: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -49351,6 +50469,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49381,19 +50501,17 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1498: +; NoVLX-NEXT: .Lcfi1778: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1499: +; NoVLX-NEXT: .Lcfi1779: ; NoVLX-NEXT: 
.cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1500: +; NoVLX-NEXT: .Lcfi1780: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -49427,6 +50545,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49458,19 +50578,17 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1501: +; NoVLX-NEXT: .Lcfi1781: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1502: +; NoVLX-NEXT: .Lcfi1782: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1503: +; NoVLX-NEXT: .Lcfi1783: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -49504,6 +50622,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49536,18 +50656,16 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # 
%entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1504: +; NoVLX-NEXT: .Lcfi1784: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1505: +; NoVLX-NEXT: .Lcfi1785: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1506: +; NoVLX-NEXT: .Lcfi1786: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -49581,6 +50699,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49612,19 +50732,17 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1507: +; NoVLX-NEXT: .Lcfi1787: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1508: +; NoVLX-NEXT: .Lcfi1788: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1509: +; NoVLX-NEXT: .Lcfi1789: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -49658,6 +50776,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: 
vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49691,53 +50811,53 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1510: +; NoVLX-NEXT: .Lcfi1790: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1511: +; NoVLX-NEXT: .Lcfi1791: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1512: +; NoVLX-NEXT: .Lcfi1792: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor 
%xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49769,53 +50889,53 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1513: +; NoVLX-NEXT: .Lcfi1793: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1514: +; NoVLX-NEXT: .Lcfi1794: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1515: +; NoVLX-NEXT: .Lcfi1795: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; 
NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49849,54 +50969,54 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1516: +; 
NoVLX-NEXT: .Lcfi1796: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1517: +; NoVLX-NEXT: .Lcfi1797: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1518: +; NoVLX-NEXT: .Lcfi1798: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, 
%esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -49931,54 +51051,54 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1519: +; NoVLX-NEXT: .Lcfi1799: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1520: +; NoVLX-NEXT: .Lcfi1800: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1521: +; NoVLX-NEXT: .Lcfi1801: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; 
NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -50014,53 +51134,53 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1522: +; NoVLX-NEXT: .Lcfi1802: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1523: +; NoVLX-NEXT: .Lcfi1803: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; 
NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1524: +; NoVLX-NEXT: .Lcfi1804: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -50095,54 +51215,54 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1525: +; NoVLX-NEXT: .Lcfi1805: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1526: +; NoVLX-NEXT: .Lcfi1806: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1527: +; NoVLX-NEXT: .Lcfi1807: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, 
%esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -50182,8 +51302,8 @@ ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -50237,8 +51357,8 @@ ; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: 
vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -50294,8 +51414,8 @@ ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -50359,8 +51479,8 @@ ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -50419,13 +51539,13 @@ ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -50491,8 +51611,8 @@ ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, 
%xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -50913,12 +52033,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1528: +; NoVLX-NEXT: .Lcfi1808: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1529: +; NoVLX-NEXT: .Lcfi1809: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1530: +; NoVLX-NEXT: .Lcfi1810: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -50956,12 +52076,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1531: +; NoVLX-NEXT: .Lcfi1811: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1532: +; NoVLX-NEXT: .Lcfi1812: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1533: +; NoVLX-NEXT: .Lcfi1813: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -51000,12 +52120,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1534: +; NoVLX-NEXT: .Lcfi1814: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1535: +; NoVLX-NEXT: .Lcfi1815: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1536: +; NoVLX-NEXT: .Lcfi1816: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -51048,12 +52168,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1537: +; NoVLX-NEXT: .Lcfi1817: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1538: +; NoVLX-NEXT: .Lcfi1818: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1539: 
+; NoVLX-NEXT: .Lcfi1819: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51101,12 +52221,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1540: +; NoVLX-NEXT: .Lcfi1820: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1541: +; NoVLX-NEXT: .Lcfi1821: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1542: +; NoVLX-NEXT: .Lcfi1822: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51114,8 +52234,8 @@ ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 +; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -51155,12 +52275,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1543: +; NoVLX-NEXT: .Lcfi1823: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1544: +; NoVLX-NEXT: .Lcfi1824: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1545: +; NoVLX-NEXT: .Lcfi1825: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51211,20 +52331,20 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1546: +; NoVLX-NEXT: .Lcfi1826: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1547: +; NoVLX-NEXT: .Lcfi1827: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq 
%rsp, %rbp -; NoVLX-NEXT: .Lcfi1548: +; NoVLX-NEXT: .Lcfi1828: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -51260,20 +52380,20 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1549: +; NoVLX-NEXT: .Lcfi1829: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1550: +; NoVLX-NEXT: .Lcfi1830: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1551: +; NoVLX-NEXT: .Lcfi1831: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ 
-51310,12 +52430,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1552: +; NoVLX-NEXT: .Lcfi1832: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1553: +; NoVLX-NEXT: .Lcfi1833: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1554: +; NoVLX-NEXT: .Lcfi1834: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51323,8 +52443,8 @@ ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -51364,12 +52484,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1555: +; NoVLX-NEXT: .Lcfi1835: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1556: +; NoVLX-NEXT: .Lcfi1836: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1557: +; NoVLX-NEXT: .Lcfi1837: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -51382,8 +52502,8 @@ ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -51423,12 +52543,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1558: +; NoVLX-NEXT: .Lcfi1838: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1559: +; NoVLX-NEXT: .Lcfi1839: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1560: +; NoVLX-NEXT: .Lcfi1840: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -51441,8 +52561,8 @@ ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -51483,12 +52603,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1561: +; NoVLX-NEXT: .Lcfi1841: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1562: +; NoVLX-NEXT: .Lcfi1842: ; NoVLX-NEXT: .cfi_offset 
%rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1563: +; NoVLX-NEXT: .Lcfi1843: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -51502,8 +52622,8 @@ ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -51734,20 +52854,18 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1564: +; NoVLX-NEXT: .Lcfi1844: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1565: +; NoVLX-NEXT: .Lcfi1845: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1566: +; NoVLX-NEXT: .Lcfi1846: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -51781,6 +52899,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, 
%zmm0, %k0 @@ -51809,20 +52929,18 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1567: +; NoVLX-NEXT: .Lcfi1847: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1568: +; NoVLX-NEXT: .Lcfi1848: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1569: +; NoVLX-NEXT: .Lcfi1849: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovaps (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -51856,6 +52974,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -51885,20 +53005,18 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1570: +; NoVLX-NEXT: .Lcfi1850: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1571: +; NoVLX-NEXT: .Lcfi1851: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1572: +; NoVLX-NEXT: .Lcfi1852: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -51932,6 +53050,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; 
NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -51963,12 +53083,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1573: +; NoVLX-NEXT: .Lcfi1853: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1574: +; NoVLX-NEXT: .Lcfi1854: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1575: +; NoVLX-NEXT: .Lcfi1855: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -51977,8 +53097,6 @@ ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -52012,6 +53130,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52043,12 +53163,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1576: +; NoVLX-NEXT: .Lcfi1856: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1577: +; NoVLX-NEXT: .Lcfi1857: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1578: +; NoVLX-NEXT: .Lcfi1858: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -52057,8 +53177,6 @@ ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; 
NoVLX-NEXT: kandw %k1, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -52092,6 +53210,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52124,12 +53244,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1579: +; NoVLX-NEXT: .Lcfi1859: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1580: +; NoVLX-NEXT: .Lcfi1860: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1581: +; NoVLX-NEXT: .Lcfi1861: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -52138,8 +53258,6 @@ ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -52173,6 +53291,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52207,55 +53327,55 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1582: +; NoVLX-NEXT: .Lcfi1862: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1583: +; NoVLX-NEXT: .Lcfi1863: ; NoVLX-NEXT: .cfi_offset %rbp, -16 
; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1584: +; NoVLX-NEXT: .Lcfi1864: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 
-; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52287,55 +53407,55 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1585: +; NoVLX-NEXT: .Lcfi1865: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1586: +; NoVLX-NEXT: .Lcfi1866: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1587: +; NoVLX-NEXT: .Lcfi1867: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vmovaps (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, 
%xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52368,55 +53488,55 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1588: +; NoVLX-NEXT: .Lcfi1868: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1589: +; NoVLX-NEXT: .Lcfi1869: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1590: +; NoVLX-NEXT: 
.Lcfi1870: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw 
$15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52451,12 +53571,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1591: +; NoVLX-NEXT: .Lcfi1871: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1592: +; NoVLX-NEXT: .Lcfi1872: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1593: +; NoVLX-NEXT: .Lcfi1873: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52465,43 +53585,43 @@ ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; 
NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52536,12 +53656,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1594: +; NoVLX-NEXT: .Lcfi1874: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1595: +; NoVLX-NEXT: .Lcfi1875: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1596: +; NoVLX-NEXT: .Lcfi1876: ; NoVLX-NEXT: .cfi_def_cfa_register 
%rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52550,43 +53670,43 @@ ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, 
%ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52622,12 +53742,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1597: +; NoVLX-NEXT: .Lcfi1877: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1598: +; NoVLX-NEXT: .Lcfi1878: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1599: +; NoVLX-NEXT: .Lcfi1879: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52636,43 +53756,43 @@ ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: 
kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -52710,87 +53830,107 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1600: +; NoVLX-NEXT: .Lcfi1880: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1601: +; NoVLX-NEXT: .Lcfi1881: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1602: +; NoVLX-NEXT: .Lcfi1882: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq 
%r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1883: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1884: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1885: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1886: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1887: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, 
%xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; 
NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -52813,87 +53953,107 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1603: +; NoVLX-NEXT: .Lcfi1888: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1604: +; NoVLX-NEXT: .Lcfi1889: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1605: +; NoVLX-NEXT: .Lcfi1890: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1891: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1892: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1893: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1894: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1895: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; 
NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, 
%r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -52917,87 +54077,107 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1606: +; NoVLX-NEXT: .Lcfi1896: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1607: +; NoVLX-NEXT: .Lcfi1897: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1608: +; NoVLX-NEXT: .Lcfi1898: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: 
pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1899: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1900: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1901: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1902: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1903: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: 
kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -53023,88 +54203,108 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1609: +; NoVLX-NEXT: .Lcfi1904: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1610: +; NoVLX-NEXT: .Lcfi1905: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1611: +; NoVLX-NEXT: .Lcfi1906: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1907: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1908: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1909: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1910: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1911: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: 
kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; 
NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -53130,88 +54330,108 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1612: +; NoVLX-NEXT: .Lcfi1912: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1613: +; NoVLX-NEXT: .Lcfi1913: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1614: +; NoVLX-NEXT: .Lcfi1914: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq 
%r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1915: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1916: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1917: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1918: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1919: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, 
%k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -53238,88 +54458,108 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1615: +; NoVLX-NEXT: .Lcfi1920: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1616: +; NoVLX-NEXT: .Lcfi1921: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1617: +; NoVLX-NEXT: .Lcfi1922: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1923: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1924: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1925: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1926: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1927: +; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; 
NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r11d ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r14d ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r15d ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r12d ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r13d ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ebx ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 -; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 -; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vmovd %r10d, %xmm0 
+; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 -; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, (%rsp) ; NoVLX-NEXT: movl (%rsp), %eax -; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -53394,12 +54634,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1618: +; NoVLX-NEXT: .Lcfi1928: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1619: +; NoVLX-NEXT: .Lcfi1929: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1620: +; NoVLX-NEXT: .Lcfi1930: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53408,17 +54648,21 @@ ; 
NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1621: +; NoVLX-NEXT: .Lcfi1931: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1622: +; NoVLX-NEXT: .Lcfi1932: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1623: +; NoVLX-NEXT: .Lcfi1933: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1624: +; NoVLX-NEXT: .Lcfi1934: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1625: +; NoVLX-NEXT: .Lcfi1935: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -53461,11 +54705,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -53477,15 +54721,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd 
%xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -53522,12 +54762,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1626: +; NoVLX-NEXT: .Lcfi1936: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1627: +; NoVLX-NEXT: .Lcfi1937: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1628: +; NoVLX-NEXT: .Lcfi1938: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53536,17 +54776,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1629: +; NoVLX-NEXT: .Lcfi1939: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1630: +; NoVLX-NEXT: .Lcfi1940: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1631: +; NoVLX-NEXT: .Lcfi1941: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1632: +; NoVLX-NEXT: .Lcfi1942: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1633: +; NoVLX-NEXT: .Lcfi1943: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -53589,11 +54833,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -53605,15 +54849,11 @@ ; NoVLX-NEXT: vpinsrb $10, 
%edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -53651,12 +54891,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1634: +; NoVLX-NEXT: .Lcfi1944: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1635: +; NoVLX-NEXT: .Lcfi1945: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1636: +; NoVLX-NEXT: .Lcfi1946: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53665,17 +54905,21 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1637: +; NoVLX-NEXT: .Lcfi1947: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1638: +; NoVLX-NEXT: .Lcfi1948: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1639: +; NoVLX-NEXT: .Lcfi1949: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1640: +; NoVLX-NEXT: .Lcfi1950: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1641: +; NoVLX-NEXT: .Lcfi1951: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -53718,11 +54962,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -53734,15 +54978,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -53782,12 +55022,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1642: +; NoVLX-NEXT: .Lcfi1952: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1643: +; NoVLX-NEXT: .Lcfi1953: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1644: +; NoVLX-NEXT: .Lcfi1954: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53796,18 +55036,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1645: +; NoVLX-NEXT: .Lcfi1955: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: 
.Lcfi1646: +; NoVLX-NEXT: .Lcfi1956: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1647: +; NoVLX-NEXT: .Lcfi1957: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1648: +; NoVLX-NEXT: .Lcfi1958: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1649: +; NoVLX-NEXT: .Lcfi1959: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -53850,11 +55094,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -53866,15 +55110,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -53914,12 +55154,12 @@ ; NoVLX-LABEL: 
test_masked_vcmpoeqps_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1650: +; NoVLX-NEXT: .Lcfi1960: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1651: +; NoVLX-NEXT: .Lcfi1961: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1652: +; NoVLX-NEXT: .Lcfi1962: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53928,18 +55168,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1653: +; NoVLX-NEXT: .Lcfi1963: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1654: +; NoVLX-NEXT: .Lcfi1964: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1655: +; NoVLX-NEXT: .Lcfi1965: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1656: +; NoVLX-NEXT: .Lcfi1966: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1657: +; NoVLX-NEXT: .Lcfi1967: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -53982,11 +55226,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -53998,15 +55242,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, 
%edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -54047,12 +55287,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1658: +; NoVLX-NEXT: .Lcfi1968: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1659: +; NoVLX-NEXT: .Lcfi1969: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1660: +; NoVLX-NEXT: .Lcfi1970: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -54061,18 +55301,22 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1661: +; NoVLX-NEXT: .Lcfi1971: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1662: +; NoVLX-NEXT: .Lcfi1972: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1663: +; NoVLX-NEXT: .Lcfi1973: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1664: +; NoVLX-NEXT: .Lcfi1974: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1665: +; NoVLX-NEXT: .Lcfi1975: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ 
-54115,11 +55359,11 @@ ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r10d ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 @@ -54131,15 +55375,11 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -54605,8 +55845,8 @@ ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 @@ -54967,12 +56207,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1666: +; NoVLX-NEXT: .Lcfi1976: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1667: +; NoVLX-NEXT: .Lcfi1977: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq 
%rsp, %rbp -; NoVLX-NEXT: .Lcfi1668: +; NoVLX-NEXT: .Lcfi1978: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -55010,12 +56250,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1669: +; NoVLX-NEXT: .Lcfi1979: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1670: +; NoVLX-NEXT: .Lcfi1980: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1671: +; NoVLX-NEXT: .Lcfi1981: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -55054,12 +56294,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1672: +; NoVLX-NEXT: .Lcfi1982: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1673: +; NoVLX-NEXT: .Lcfi1983: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1674: +; NoVLX-NEXT: .Lcfi1984: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -55102,12 +56342,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1675: +; NoVLX-NEXT: .Lcfi1985: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1676: +; NoVLX-NEXT: .Lcfi1986: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1677: +; NoVLX-NEXT: .Lcfi1987: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -55154,20 +56394,20 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1678: +; NoVLX-NEXT: .Lcfi1988: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1679: +; NoVLX-NEXT: .Lcfi1989: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1680: +; 
NoVLX-NEXT: .Lcfi1990: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -55207,12 +56447,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1681: +; NoVLX-NEXT: .Lcfi1991: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1682: +; NoVLX-NEXT: .Lcfi1992: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1683: +; NoVLX-NEXT: .Lcfi1993: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -55262,20 +56502,20 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1684: +; NoVLX-NEXT: .Lcfi1994: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1685: +; NoVLX-NEXT: .Lcfi1995: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1686: +; NoVLX-NEXT: .Lcfi1996: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55311,20 +56551,20 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1687: +; NoVLX-NEXT: .Lcfi1997: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1688: +; NoVLX-NEXT: .Lcfi1998: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1689: +; NoVLX-NEXT: .Lcfi1999: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55361,12 +56601,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1690: +; NoVLX-NEXT: .Lcfi2000: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1691: +; NoVLX-NEXT: .Lcfi2001: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1692: +; NoVLX-NEXT: .Lcfi2002: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ 
-55374,8 +56614,8 @@ ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55415,12 +56655,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1693: +; NoVLX-NEXT: .Lcfi2003: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1694: +; NoVLX-NEXT: .Lcfi2004: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1695: +; NoVLX-NEXT: .Lcfi2005: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -55432,8 +56672,8 @@ ; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55473,12 +56713,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # 
%entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1696: +; NoVLX-NEXT: .Lcfi2006: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1697: +; NoVLX-NEXT: .Lcfi2007: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1698: +; NoVLX-NEXT: .Lcfi2008: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -55490,8 +56730,8 @@ ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55532,12 +56772,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1699: +; NoVLX-NEXT: .Lcfi2009: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1700: +; NoVLX-NEXT: .Lcfi2010: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1701: +; NoVLX-NEXT: .Lcfi2011: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -55550,8 +56790,8 @@ ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -55598,8 +56838,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -55655,8 +56895,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -55714,8 +56954,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -55781,8 +57021,8 @@ ; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -55848,8 +57088,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; 
NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -55917,8 +57157,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -56352,12 +57592,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1702: +; NoVLX-NEXT: .Lcfi2012: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1703: +; NoVLX-NEXT: .Lcfi2013: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1704: +; NoVLX-NEXT: .Lcfi2014: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -56397,12 +57637,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1705: +; NoVLX-NEXT: .Lcfi2015: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1706: +; NoVLX-NEXT: .Lcfi2016: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1707: +; NoVLX-NEXT: .Lcfi2017: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -56443,12 +57683,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1708: +; NoVLX-NEXT: .Lcfi2018: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1709: +; NoVLX-NEXT: .Lcfi2019: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1710: +; 
NoVLX-NEXT: .Lcfi2020: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -56493,12 +57733,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1711: +; NoVLX-NEXT: .Lcfi2021: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1712: +; NoVLX-NEXT: .Lcfi2022: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1713: +; NoVLX-NEXT: .Lcfi2023: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56548,12 +57788,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1714: +; NoVLX-NEXT: .Lcfi2024: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1715: +; NoVLX-NEXT: .Lcfi2025: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1716: +; NoVLX-NEXT: .Lcfi2026: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56604,12 +57844,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1717: +; NoVLX-NEXT: .Lcfi2027: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1718: +; NoVLX-NEXT: .Lcfi2028: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1719: +; NoVLX-NEXT: .Lcfi2029: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56662,12 +57902,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1720: +; NoVLX-NEXT: .Lcfi2030: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1721: +; NoVLX-NEXT: .Lcfi2031: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1722: +; NoVLX-NEXT: .Lcfi2032: ; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56675,8 +57915,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -56713,12 +57953,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1723: +; NoVLX-NEXT: .Lcfi2033: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1724: +; NoVLX-NEXT: .Lcfi2034: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1725: +; NoVLX-NEXT: .Lcfi2035: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56726,8 +57966,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -56765,12 +58005,12 @@ ; NoVLX-LABEL: 
test_vcmpoeqpd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1726: +; NoVLX-NEXT: .Lcfi2036: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1727: +; NoVLX-NEXT: .Lcfi2037: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1728: +; NoVLX-NEXT: .Lcfi2038: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56779,8 +58019,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -56821,12 +58061,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1729: +; NoVLX-NEXT: .Lcfi2039: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1730: +; NoVLX-NEXT: .Lcfi2040: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1731: +; NoVLX-NEXT: .Lcfi2041: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -56840,8 +58080,8 @@ ; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) 
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -56882,12 +58122,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1732: +; NoVLX-NEXT: .Lcfi2042: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1733: +; NoVLX-NEXT: .Lcfi2043: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1734: +; NoVLX-NEXT: .Lcfi2044: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -56901,8 +58141,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -56944,12 +58184,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1735: +; NoVLX-NEXT: .Lcfi2045: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1736: +; NoVLX-NEXT: .Lcfi2046: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1737: +; NoVLX-NEXT: .Lcfi2047: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ 
-56964,8 +58204,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 @@ -57226,18 +58466,16 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1738: +; NoVLX-NEXT: .Lcfi2048: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1739: +; NoVLX-NEXT: .Lcfi2049: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1740: +; NoVLX-NEXT: .Lcfi2050: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -57271,6 +58509,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -57299,18 +58539,16 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1741: +; NoVLX-NEXT: .Lcfi2051: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1742: +; NoVLX-NEXT: 
.Lcfi2052: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1743: +; NoVLX-NEXT: .Lcfi2053: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -57344,6 +58582,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -57373,18 +58613,16 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1744: +; NoVLX-NEXT: .Lcfi2054: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1745: +; NoVLX-NEXT: .Lcfi2055: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1746: +; NoVLX-NEXT: .Lcfi2056: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -57418,6 +58656,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -57449,19 +58689,17 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: 
.Lcfi1747: +; NoVLX-NEXT: .Lcfi2057: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1748: +; NoVLX-NEXT: .Lcfi2058: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1749: +; NoVLX-NEXT: .Lcfi2059: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -57495,6 +58733,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -57526,19 +58766,17 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1750: +; NoVLX-NEXT: .Lcfi2060: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1751: +; NoVLX-NEXT: .Lcfi2061: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1752: +; NoVLX-NEXT: .Lcfi2062: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -57572,6 +58810,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -57604,19 +58844,17 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1753: +; NoVLX-NEXT: .Lcfi2063: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1754: +; NoVLX-NEXT: .Lcfi2064: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1755: +; NoVLX-NEXT: .Lcfi2065: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1} -; NoVLX-NEXT: kxorw %k0, %k0, %k1 -; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %r8d @@ -57650,6 +58888,8 @@ ; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kxorw %k0, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -57732,53 +58972,53 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1756: +; NoVLX-NEXT: .Lcfi2066: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1757: +; NoVLX-NEXT: .Lcfi2067: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1758: +; NoVLX-NEXT: .Lcfi2068: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, 
%k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd 
%zmm0, %zmm0, %k0 @@ -57810,53 +59050,53 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1759: +; NoVLX-NEXT: .Lcfi2069: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1760: +; NoVLX-NEXT: .Lcfi2070: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1761: +; NoVLX-NEXT: .Lcfi2071: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -57889,53 +59129,53 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1762: +; NoVLX-NEXT: .Lcfi2072: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1763: +; NoVLX-NEXT: .Lcfi2073: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1764: +; NoVLX-NEXT: .Lcfi2074: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -57970,54 +59210,54 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1765: +; NoVLX-NEXT: .Lcfi2075: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1766: +; NoVLX-NEXT: .Lcfi2076: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1767: +; NoVLX-NEXT: .Lcfi2077: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -58052,54 +59292,54 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1768: +; NoVLX-NEXT: .Lcfi2078: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1769: +; NoVLX-NEXT: .Lcfi2079: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1770: +; NoVLX-NEXT: .Lcfi2080: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; 
NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -58135,54 +59375,54 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1771: +; NoVLX-NEXT: .Lcfi2081: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1772: +; NoVLX-NEXT: .Lcfi2082: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; 
NoVLX-NEXT: .Lcfi1773: +; NoVLX-NEXT: .Lcfi2083: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %r8d ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %r9d ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %esi ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edi ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 -; NoVLX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw 
$15, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, %ecx -; NoVLX-NEXT: kxorw %k0, %k0, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 Index: test/CodeGen/X86/f16c-schedule.ll =================================================================== --- test/CodeGen/X86/f16c-schedule.ll +++ test/CodeGen/X86/f16c-schedule.ll @@ -30,9 +30,9 @@ ; ; SKYLAKE-LABEL: test_vcvtph2ps_128: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [1:1.00] -; SKYLAKE-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [4:0.50] +; SKYLAKE-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_vcvtph2ps_128: @@ -80,9 +80,9 @@ ; ; SKYLAKE-LABEL: test_vcvtph2ps_256: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [1:1.00] -; SKYLAKE-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [4:0.50] +; SKYLAKE-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [7:1.00] +; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_vcvtph2ps_256: @@ -127,8 +127,8 @@ ; ; SKYLAKE-LABEL: test_vcvtps2ph_128: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [4:1.00] -; SKYLAKE-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [4:1.00] +; 
SKYLAKE-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [5:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_vcvtps2ph_128: @@ -174,8 +174,8 @@ ; ; SKYLAKE-LABEL: test_vcvtps2ph_256: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:1.00] -; SKYLAKE-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [6:1.00] +; SKYLAKE-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [7:1.00] +; SKYLAKE-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [7:1.00] ; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; Index: test/CodeGen/X86/pr32329.ll =================================================================== --- test/CodeGen/X86/pr32329.ll +++ test/CodeGen/X86/pr32329.ll @@ -36,33 +36,33 @@ ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .Lcfi7: ; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl obj, %edx ; X86-NEXT: movsbl var_27, %eax ; X86-NEXT: movl var_310, %ecx ; X86-NEXT: imull %eax, %ecx -; X86-NEXT: movl obj, %esi +; X86-NEXT: movzwl var_2, %esi ; X86-NEXT: addl var_24, %ecx -; X86-NEXT: movzwl var_2, %edi -; X86-NEXT: andl $4194303, %esi # imm = 0x3FFFFF -; X86-NEXT: leal (%esi,%esi), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: subl %edi, %ebx -; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: andl $4194303, %edx # imm = 0x3FFFFF +; X86-NEXT: leal (%edx,%edx), %ebx +; X86-NEXT: subl %eax, %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: subl %esi, %edi +; X86-NEXT: imull %edi, %ecx ; X86-NEXT: addl $-1437483407, %ecx # imm = 0xAA51BE71 -; X86-NEXT: movl $9, %edi +; X86-NEXT: movl $9, %esi ; X86-NEXT: xorl %ebp, %ebp -; X86-NEXT: shldl %cl, %edi, %ebp -; X86-NEXT: shlxl %ecx, %edi, %edi +; X86-NEXT: shldl %cl, %esi, %ebp +; X86-NEXT: shlxl %ecx, %esi, %esi ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %edi, %ebp +; X86-NEXT: cmovnel %esi, %ebp ; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovnel %ecx, %edi +; X86-NEXT: cmovnel %ecx, %esi ; 
X86-NEXT: movl %ebp, var_50+4 -; X86-NEXT: cmpl %esi, %ebx +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %esi, var_50 ; X86-NEXT: setge var_205 -; X86-NEXT: imull %eax, %edx -; X86-NEXT: movl %edi, var_50 -; X86-NEXT: movb %dl, var_218 +; X86-NEXT: imull %eax, %ebx +; X86-NEXT: movb %bl, var_218 ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -71,25 +71,25 @@ ; ; X64-LABEL: foo: ; X64: # BB#0: # %entry -; X64-NEXT: movsbl {{.*}}(%rip), %eax -; X64-NEXT: movl {{.*}}(%rip), %ecx -; X64-NEXT: imull %eax, %ecx -; X64-NEXT: movl {{.*}}(%rip), %edx -; X64-NEXT: addl {{.*}}(%rip), %ecx +; X64-NEXT: movl {{.*}}(%rip), %eax +; X64-NEXT: movsbl {{.*}}(%rip), %r9d ; X64-NEXT: movzwl {{.*}}(%rip), %r8d -; X64-NEXT: andl $4194303, %edx # imm = 0x3FFFFF -; X64-NEXT: leal (%rdx,%rdx), %edi -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %esi -; X64-NEXT: subl %r8d, %esi -; X64-NEXT: imull %esi, %ecx -; X64-NEXT: addl $-1437483407, %ecx # imm = 0xAA51BE71 -; X64-NEXT: movl $9, %r8d -; X64-NEXT: cmpl %edx, %esi -; X64-NEXT: setge {{.*}}(%rip) -; X64-NEXT: shlxq %rcx, %r8, %rcx -; X64-NEXT: imull %eax, %edi +; X64-NEXT: movl {{.*}}(%rip), %esi +; X64-NEXT: imull %r9d, %esi +; X64-NEXT: addl {{.*}}(%rip), %esi +; X64-NEXT: andl $4194303, %eax # imm = 0x3FFFFF +; X64-NEXT: leal (%rax,%rax), %edi +; X64-NEXT: subl %r9d, %edi +; X64-NEXT: movl %edi, %edx +; X64-NEXT: subl %r8d, %edx +; X64-NEXT: imull %edx, %esi +; X64-NEXT: addl $-1437483407, %esi # imm = 0xAA51BE71 +; X64-NEXT: movl $9, %ecx +; X64-NEXT: shlxq %rsi, %rcx, %rcx ; X64-NEXT: movq %rcx, {{.*}}(%rip) +; X64-NEXT: cmpl %eax, %edx +; X64-NEXT: setge {{.*}}(%rip) +; X64-NEXT: imull %r9d, %edi ; X64-NEXT: movb %dil, {{.*}}(%rip) ; X64-NEXT: retq entry: Index: test/CodeGen/X86/recip-fastmath.ll =================================================================== --- test/CodeGen/X86/recip-fastmath.ll +++ test/CodeGen/X86/recip-fastmath.ll @@ -61,11 +61,17 @@ ; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, 
%xmm0 ; HASWELL-NO-FMA-NEXT: retq ; -; AVX512-LABEL: f32_no_estimate: -; AVX512: # BB#0: -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50] -; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00] -; AVX512-NEXT: retq # sched: [2:1.00] +; KNL-LABEL: f32_no_estimate: +; KNL: # BB#0: +; KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50] +; KNL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00] +; KNL-NEXT: retq # sched: [2:1.00] +; +; SKX-LABEL: f32_no_estimate: +; SKX: # BB#0: +; SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50] +; SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [11:1.00] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 1.0, %x ret float %div } @@ -136,12 +142,19 @@ ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; -; AVX512-LABEL: f32_one_step: -; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: retq # sched: [2:1.00] +; KNL-LABEL: f32_one_step: +; KNL: # BB#0: +; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] +; +; SKX-LABEL: f32_one_step: +; SKX: # BB#0: +; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 1.0, %x ret float %div } @@ -242,16 +255,27 @@ ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; -; AVX512-LABEL: f32_two_step: -; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] -; AVX512-NEXT: vmovaps %xmm1, %xmm3 
# sched: [1:1.00] -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50] -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50] -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: retq # sched: [2:1.00] +; KNL-LABEL: f32_two_step: +; KNL: # BB#0: +; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] +; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] +; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50] +; KNL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] +; +; SKX-LABEL: f32_two_step: +; SKX: # BB#0: +; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] +; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] +; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.50] +; SKX-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [4:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 1.0, %x ret float %div } @@ -300,11 +324,17 @@ ; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; -; AVX512-LABEL: v4f32_no_estimate: -; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50] -; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00] -; AVX512-NEXT: retq # sched: [2:1.00] +; KNL-LABEL: v4f32_no_estimate: +; KNL: # BB#0: +; KNL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50] +; KNL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00] +; KNL-NEXT: retq # sched: [2:1.00] +; +; SKX-LABEL: v4f32_no_estimate: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastss 
{{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50] +; SKX-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [11:1.00] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -388,7 +418,7 @@ ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div @@ -506,10 +536,10 @@ ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50] -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50] -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50] -; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.50] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div @@ -562,11 +592,17 @@ ; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; HASWELL-NO-FMA-NEXT: retq ; -; AVX512-LABEL: v8f32_no_estimate: -; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50] -; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00] -; AVX512-NEXT: retq # sched: [2:1.00] +; KNL-LABEL: v8f32_no_estimate: +; KNL: # BB#0: +; KNL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50] +; KNL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00] +; KNL-NEXT: retq # sched: [2:1.00] +; +; SKX-LABEL: v8f32_no_estimate: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50] +; 
SKX-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [11:1.00] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -657,7 +693,7 @@ ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div @@ -788,10 +824,10 @@ ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50] -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50] -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50] -; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.50] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div Index: test/CodeGen/X86/recip-fastmath2.ll =================================================================== --- test/CodeGen/X86/recip-fastmath2.ll +++ test/CodeGen/X86/recip-fastmath2.ll @@ -54,11 +54,17 @@ ; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; -; AVX512-LABEL: f32_no_step_2: -; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: retq # sched: [2:1.00] +; KNL-LABEL: f32_no_step_2: +; KNL: # BB#0: +; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 +; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] +; 
+; SKX-LABEL: f32_no_step_2: +; SKX: # BB#0: +; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [4:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 1234.0, %x ret float %div } @@ -136,13 +142,21 @@ ; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; -; AVX512-LABEL: f32_one_step_2: -; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: retq # sched: [2:1.00] +; KNL-LABEL: f32_one_step_2: +; KNL: # BB#0: +; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] +; +; SKX-LABEL: f32_one_step_2: +; SKX: # BB#0: +; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [4:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 3456.0, %x ret float %div } @@ -227,14 +241,23 @@ ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; -; AVX512-LABEL: f32_one_step_2_divs: -; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50] -; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: retq # sched: [2:1.00] +; KNL-LABEL: 
f32_one_step_2_divs: +; KNL: # BB#0: +; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50] +; KNL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] +; +; SKX-LABEL: f32_one_step_2_divs: +; SKX: # BB#0: +; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [4:0.50] +; SKX-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [4:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 3456.0, %x %div2 = fdiv fast float %div, %x ret float %div2 @@ -343,17 +366,29 @@ ; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; -; AVX512-LABEL: f32_two_step_2: -; AVX512: # BB#0: -; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] -; AVX512-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50] -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50] -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: retq # sched: [2:1.00] +; KNL-LABEL: f32_two_step_2: +; KNL: # BB#0: +; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] +; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] +; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50] +; KNL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; 
KNL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] +; +; SKX-LABEL: f32_two_step_2: +; SKX: # BB#0: +; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 +; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] +; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] +; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.50] +; SKX-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [4:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 6789.0, %x ret float %div } @@ -445,8 +480,8 @@ ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div @@ -547,9 +582,9 @@ ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50] -; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [4:0.50] +; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x %div2 = fdiv fast <4 x float> %div, %x @@ -676,11 +711,11 @@ ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] ; SKX-NEXT: 
vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50] -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50] -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50] -; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50] -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.50] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div @@ -781,8 +816,8 @@ ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50] -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div @@ -892,9 +927,9 @@ ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50] -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50] -; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [4:0.50] +; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x %div2 = fdiv fast <8 x float> %div, %x @@ -1035,11 +1070,11 @@ ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] ; SKX-NEXT: vmovaps 
%ymm1, %ymm3 # sched: [1:1.00] -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50] -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50] -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50] -; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50] -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.50] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.50] +; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [4:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div @@ -1149,7 +1184,7 @@ ; SKX-LABEL: v8f32_no_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div Index: test/CodeGen/X86/sse-schedule.ll =================================================================== --- test/CodeGen/X86/sse-schedule.ll +++ test/CodeGen/X86/sse-schedule.ll @@ -42,8 +42,8 @@ ; ; SKYLAKE-LABEL: test_addps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addps: @@ -96,8 +96,8 @@ ; ; SKYLAKE-LABEL: test_addss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: 
[2:1.00] ; ; BTVER2-LABEL: test_addss: @@ -154,8 +154,8 @@ ; ; SKYLAKE-LABEL: test_andps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andps: @@ -216,8 +216,8 @@ ; ; SKYLAKE-LABEL: test_andnotps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andnotps: @@ -281,9 +281,9 @@ ; ; SKYLAKE-LABEL: test_cmpps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00] -; SKYLAKE-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [4:0.33] +; SKYLAKE-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cmpps: @@ -525,9 +525,9 @@ ; ; SKYLAKE-LABEL: test_cvtsi2ss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00] ; SKYLAKE-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] -; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsi2ss: @@ -588,9 +588,9 @@ ; ; SKYLAKE-LABEL: test_cvtsi2ssq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00] +; SKYLAKE-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # 
sched: [6:2.00] ; SKYLAKE-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] -; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsi2ssq: @@ -651,8 +651,8 @@ ; ; SKYLAKE-LABEL: test_cvtss2si: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtss2si %xmm0, %ecx # sched: [4:1.00] -; SKYLAKE-NEXT: vcvtss2si (%rdi), %eax # sched: [4:1.00] +; SKYLAKE-NEXT: vcvtss2si %xmm0, %ecx # sched: [6:1.00] +; SKYLAKE-NEXT: vcvtss2si (%rdi), %eax # sched: [6:1.00] ; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -717,8 +717,8 @@ ; ; SKYLAKE-LABEL: test_cvtss2siq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtss2si %xmm0, %rcx # sched: [4:1.00] -; SKYLAKE-NEXT: vcvtss2si (%rdi), %rax # sched: [4:1.00] +; SKYLAKE-NEXT: vcvtss2si %xmm0, %rcx # sched: [6:1.00] +; SKYLAKE-NEXT: vcvtss2si (%rdi), %rax # sched: [6:1.00] ; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -783,8 +783,8 @@ ; ; SKYLAKE-LABEL: test_cvttss2si: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvttss2si %xmm0, %ecx # sched: [4:1.00] -; SKYLAKE-NEXT: vcvttss2si (%rdi), %eax # sched: [4:1.00] +; SKYLAKE-NEXT: vcvttss2si %xmm0, %ecx # sched: [7:1.00] +; SKYLAKE-NEXT: vcvttss2si (%rdi), %eax # sched: [6:1.00] ; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -846,8 +846,8 @@ ; ; SKYLAKE-LABEL: test_cvttss2siq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvttss2si %xmm0, %rcx # sched: [4:1.00] -; SKYLAKE-NEXT: vcvttss2si (%rdi), %rax # sched: [4:1.00] +; SKYLAKE-NEXT: vcvttss2si %xmm0, %rcx # sched: [7:1.00] +; SKYLAKE-NEXT: vcvttss2si (%rdi), %rax # sched: [6:1.00] ; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -904,8 +904,8 @@ ; ; SKYLAKE-LABEL: test_divps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [13:1.00] 
-; SKYLAKE-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [13:1.00] +; SKYLAKE-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [11:1.00] +; SKYLAKE-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [11:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divps: @@ -958,8 +958,8 @@ ; ; SKYLAKE-LABEL: test_divss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [13:1.00] -; SKYLAKE-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [13:1.00] +; SKYLAKE-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [11:1.00] +; SKYLAKE-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [11:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divss: @@ -1068,8 +1068,8 @@ ; ; SKYLAKE-LABEL: test_maxps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxps: @@ -1123,8 +1123,8 @@ ; ; SKYLAKE-LABEL: test_maxss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxss: @@ -1178,8 +1178,8 @@ ; ; SKYLAKE-LABEL: test_minps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_minps: @@ -1233,8 +1233,8 @@ ; ; SKYLAKE-LABEL: test_minss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vminss (%rdi), %xmm0, 
%xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_minss: @@ -1294,7 +1294,7 @@ ; SKYLAKE-LABEL: test_movaps: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovaps (%rdi), %xmm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1412,7 +1412,7 @@ ; SKYLAKE-LABEL: test_movhps: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1475,7 +1475,7 @@ ; SKYLAKE-LABEL: test_movlhps: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; SKYLAKE-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movlhps: @@ -1533,7 +1533,7 @@ ; SKYLAKE-LABEL: test_movlps: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1590,7 +1590,7 @@ ; ; SKYLAKE-LABEL: test_movmskps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmovmskps %xmm0, %eax # sched: [3:1.00] +; SKYLAKE-NEXT: vmovmskps %xmm0, %eax # sched: [2:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movmskps: @@ -1696,7 +1696,7 @@ ; SKYLAKE-LABEL: test_movss_mem: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovss 
{{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50] -; SKYLAKE-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1808,7 +1808,7 @@ ; SKYLAKE-LABEL: test_movups: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovups (%rdi), %xmm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1864,8 +1864,8 @@ ; ; SKYLAKE-LABEL: test_mulps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; SKYLAKE-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SKYLAKE-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulps: @@ -1918,8 +1918,8 @@ ; ; SKYLAKE-LABEL: test_mulss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; SKYLAKE-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SKYLAKE-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulss: @@ -1976,8 +1976,8 @@ ; ; SKYLAKE-LABEL: test_orps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_orps: @@ -2092,9 +2092,9 @@ ; ; SKYLAKE-LABEL: test_rcpps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vrcpps (%rdi), %xmm1 # sched: [5:1.00] -; SKYLAKE-NEXT: vaddps 
%xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vrcpps %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vrcpps (%rdi), %xmm1 # sched: [4:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rcpps: @@ -2163,10 +2163,10 @@ ; ; SKYLAKE-LABEL: test_rcpss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00] ; SKYLAKE-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50] -; SKYLAKE-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [5:1.00] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [4:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rcpss: @@ -2234,9 +2234,9 @@ ; ; SKYLAKE-LABEL: test_rsqrtps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [5:1.00] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [4:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rsqrtps: @@ -2305,10 +2305,10 @@ ; ; SKYLAKE-LABEL: test_rsqrtss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [4:1.00] ; SKYLAKE-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50] -; SKYLAKE-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [4:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rsqrtss: @@ -2484,9 +2484,9 @@ ; 
; SKYLAKE-LABEL: test_sqrtps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsqrtps %xmm0, %xmm0 # sched: [14:1.00] -; SKYLAKE-NEXT: vsqrtps (%rdi), %xmm1 # sched: [14:1.00] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsqrtps %xmm0, %xmm0 # sched: [12:1.00] +; SKYLAKE-NEXT: vsqrtps (%rdi), %xmm1 # sched: [12:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtps: @@ -2555,10 +2555,10 @@ ; ; SKYLAKE-LABEL: test_sqrtss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [14:1.00] +; SKYLAKE-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00] ; SKYLAKE-NEXT: vmovaps (%rdi), %xmm1 # sched: [1:0.50] -; SKYLAKE-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [14:1.00] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [12:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtss: @@ -2673,8 +2673,8 @@ ; ; SKYLAKE-LABEL: test_subps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subps: @@ -2727,8 +2727,8 @@ ; ; SKYLAKE-LABEL: test_subss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subss: @@ -3021,8 +3021,8 @@ ; ; SKYLAKE-LABEL: test_xorps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vxorps 
(%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_xorps: Index: test/CodeGen/X86/sse2-schedule.ll =================================================================== --- test/CodeGen/X86/sse2-schedule.ll +++ test/CodeGen/X86/sse2-schedule.ll @@ -42,8 +42,8 @@ ; ; SKYLAKE-LABEL: test_addpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addpd: @@ -96,8 +96,8 @@ ; ; SKYLAKE-LABEL: test_addsd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addsd: @@ -155,9 +155,9 @@ ; ; SKYLAKE-LABEL: test_andpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andpd: @@ -222,9 +222,9 @@ ; ; SKYLAKE-LABEL: test_andnotpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vandnpd %xmm1, 
%xmm0, %xmm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andnotpd: @@ -291,9 +291,9 @@ ; ; SKYLAKE-LABEL: test_cmppd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] -; SKYLAKE-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [4:0.33] +; SKYLAKE-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cmppd: @@ -535,9 +535,9 @@ ; ; SKYLAKE-LABEL: test_cvtdq2pd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] -; SKYLAKE-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [4:1.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [5:1.00] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtdq2pd: @@ -601,9 +601,9 @@ ; ; SKYLAKE-LABEL: test_cvtdq2ps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtdq2ps: @@ -665,9 +665,9 @@ ; ; SKYLAKE-LABEL: test_cvtpd2dq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [5:1.00] ; SKYLAKE-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: 
[7:1.00] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtpd2dq: @@ -730,9 +730,9 @@ ; ; SKYLAKE-LABEL: test_cvtpd2ps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [5:1.00] ; SKYLAKE-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtpd2ps: @@ -795,9 +795,9 @@ ; ; SKYLAKE-LABEL: test_cvtps2dq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [3:1.00] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [4:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtps2dq: @@ -860,9 +860,9 @@ ; ; SKYLAKE-LABEL: test_cvtps2pd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtps2pd: @@ -925,8 +925,8 @@ ; ; SKYLAKE-LABEL: test_cvtsd2si: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtsd2si %xmm0, %ecx # sched: [4:1.00] -; SKYLAKE-NEXT: vcvtsd2si (%rdi), %eax # sched: [4:1.00] +; SKYLAKE-NEXT: vcvtsd2si %xmm0, %ecx # sched: [6:1.00] +; SKYLAKE-NEXT: vcvtsd2si (%rdi), %eax # sched: [6:1.00] ; SKYLAKE-NEXT: addl %ecx, %eax # sched: 
[1:0.25] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -991,8 +991,8 @@ ; ; SKYLAKE-LABEL: test_cvtsd2siq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtsd2si %xmm0, %rcx # sched: [4:1.00] -; SKYLAKE-NEXT: vcvtsd2si (%rdi), %rax # sched: [4:1.00] +; SKYLAKE-NEXT: vcvtsd2si %xmm0, %rcx # sched: [6:1.00] +; SKYLAKE-NEXT: vcvtsd2si (%rdi), %rax # sched: [6:1.00] ; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1063,10 +1063,10 @@ ; ; SKYLAKE-LABEL: test_cvtsd2ss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] ; SKYLAKE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [1:0.50] -; SKYLAKE-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00] -; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [5:1.00] +; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsd2ss: @@ -1129,9 +1129,9 @@ ; ; SKYLAKE-LABEL: test_cvtsi2sd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00] ; SKYLAKE-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] -; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsi2sd: @@ -1192,9 +1192,9 @@ ; ; SKYLAKE-LABEL: test_cvtsi2sdq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [5:1.00] ; SKYLAKE-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] -; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsi2sdq: @@ -1263,10 
+1263,10 @@ ; ; SKYLAKE-LABEL: test_cvtss2sd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [2:1.00] +; SKYLAKE-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00] ; SKYLAKE-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50] -; SKYLAKE-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [2:1.00] -; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [5:1.00] +; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtss2sd: @@ -1330,9 +1330,9 @@ ; ; SKYLAKE-LABEL: test_cvttpd2dq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00] +; SKYLAKE-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [5:1.00] ; SKYLAKE-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvttpd2dq: @@ -1396,9 +1396,9 @@ ; ; SKYLAKE-LABEL: test_cvttps2dq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [3:1.00] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [4:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvttps2dq: @@ -1459,8 +1459,8 @@ ; ; SKYLAKE-LABEL: test_cvttsd2si: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvttsd2si %xmm0, %ecx # sched: [4:1.00] -; SKYLAKE-NEXT: vcvttsd2si (%rdi), %eax # sched: [4:1.00] +; SKYLAKE-NEXT: vcvttsd2si %xmm0, %ecx # sched: [6:1.00] +; SKYLAKE-NEXT: vcvttsd2si (%rdi), %eax # sched: [6:1.00] ; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1522,8 +1522,8 @@ ; ; 
SKYLAKE-LABEL: test_cvttsd2siq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vcvttsd2si %xmm0, %rcx # sched: [4:1.00] -; SKYLAKE-NEXT: vcvttsd2si (%rdi), %rax # sched: [4:1.00] +; SKYLAKE-NEXT: vcvttsd2si %xmm0, %rcx # sched: [6:1.00] +; SKYLAKE-NEXT: vcvttsd2si (%rdi), %rax # sched: [6:1.00] ; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -1580,8 +1580,8 @@ ; ; SKYLAKE-LABEL: test_divpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [20:1.00] -; SKYLAKE-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [20:1.00] +; SKYLAKE-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [14:1.00] +; SKYLAKE-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [14:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divpd: @@ -1634,8 +1634,8 @@ ; ; SKYLAKE-LABEL: test_divsd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [20:1.00] -; SKYLAKE-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [20:1.00] +; SKYLAKE-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [14:1.00] +; SKYLAKE-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [14:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divsd: @@ -1839,8 +1839,8 @@ ; ; SKYLAKE-LABEL: test_maxpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxpd: @@ -1894,8 +1894,8 @@ ; ; SKYLAKE-LABEL: test_maxsd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxsd: @@ -1949,8 +1949,8 @@ ; ; SKYLAKE-LABEL: 
test_minpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_minpd: @@ -2004,8 +2004,8 @@ ; ; SKYLAKE-LABEL: test_minsd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_minsd: @@ -2065,7 +2065,7 @@ ; SKYLAKE-LABEL: test_movapd: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovapd (%rdi), %xmm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2127,7 +2127,7 @@ ; SKYLAKE-LABEL: test_movdqa: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovdqa (%rdi), %xmm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2189,7 +2189,7 @@ ; SKYLAKE-LABEL: test_movdqu: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovdqu (%rdi), %xmm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2267,9 +2267,9 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovd %edi, %xmm1 # sched: [1:1.00] ; SKYLAKE-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # 
sched: [1:0.50] -; SKYLAKE-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vmovd %xmm0, %eax # sched: [1:1.00] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.33] +; SKYLAKE-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vmovd %xmm0, %eax # sched: [2:1.00] ; SKYLAKE-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2358,9 +2358,9 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00] ; SKYLAKE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vmovq %xmm0, %rax # sched: [1:1.00] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.33] +; SKYLAKE-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vmovq %xmm0, %rax # sched: [2:1.00] ; SKYLAKE-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2433,7 +2433,7 @@ ; SKYLAKE-LABEL: test_movhpd: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2498,7 +2498,7 @@ ; SKYLAKE-LABEL: test_movlpd: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2554,7 +2554,7 @@ ; ; SKYLAKE-LABEL: test_movmskpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmovmskpd %xmm0, %eax # sched: [3:1.00] +; SKYLAKE-NEXT: vmovmskpd %xmm0, %eax # sched: [2:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movmskpd: 
@@ -2606,7 +2606,7 @@ ; ; SKYLAKE-LABEL: test_movntdqa: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2659,7 +2659,7 @@ ; ; SKYLAKE-LABEL: test_movntpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2718,7 +2718,7 @@ ; SKYLAKE-LABEL: test_movq_mem: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2779,7 +2779,7 @@ ; SKYLAKE-LABEL: test_movq_reg: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33] -; SKYLAKE-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movq_reg: @@ -2837,7 +2837,7 @@ ; SKYLAKE-LABEL: test_movsd_mem: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [1:0.50] -; SKYLAKE-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -2950,7 +2950,7 @@ ; SKYLAKE-LABEL: test_movupd: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovupd (%rdi), %xmm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ 
-3006,8 +3006,8 @@ ; ; SKYLAKE-LABEL: test_mulpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; SKYLAKE-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SKYLAKE-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulpd: @@ -3060,8 +3060,8 @@ ; ; SKYLAKE-LABEL: test_mulsd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; SKYLAKE-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SKYLAKE-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulsd: @@ -3119,9 +3119,9 @@ ; ; SKYLAKE-LABEL: test_orpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_orpd: @@ -3365,7 +3365,7 @@ ; ; SKYLAKE-LABEL: test_paddb: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -3423,7 +3423,7 @@ ; ; SKYLAKE-LABEL: test_paddd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -3477,7 +3477,7 @@ ; ; SKYLAKE-LABEL: test_paddq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, 
%xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -3771,7 +3771,7 @@ ; ; SKYLAKE-LABEL: test_paddw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -3832,7 +3832,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pand: @@ -3901,7 +3901,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pandn: @@ -4476,7 +4476,7 @@ ; ; SKYLAKE-LABEL: test_pextrw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpextrw $6, %xmm0, %eax # sched: [2:1.00] +; SKYLAKE-NEXT: vpextrw $6, %xmm0, %eax # sched: [3:1.00] ; SKYLAKE-NEXT: # kill: %AX %AX %EAX ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -4594,8 +4594,8 @@ ; ; SKYLAKE-LABEL: test_pmaddwd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmaddwd: @@ -4883,7 +4883,7 @@ ; ; SKYLAKE-LABEL: test_pmovmskb: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmovmskb %xmm0, %eax # sched: 
[3:1.00] +; SKYLAKE-NEXT: vpmovmskb %xmm0, %eax # sched: [2:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovmskb: @@ -4933,8 +4933,8 @@ ; ; SKYLAKE-LABEL: test_pmulhuw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmulhuw: @@ -4988,8 +4988,8 @@ ; ; SKYLAKE-LABEL: test_pmulhw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmulhw: @@ -5043,8 +5043,8 @@ ; ; SKYLAKE-LABEL: test_pmullw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmullw: @@ -5105,8 +5105,8 @@ ; ; SKYLAKE-LABEL: test_pmuludq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmuludq: @@ -5168,7 +5168,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, 
%xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_por: @@ -5232,8 +5232,8 @@ ; ; SKYLAKE-LABEL: test_psadbw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [3:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psadbw: @@ -5297,7 +5297,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00] ; SKYLAKE-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pshufd: @@ -5362,7 +5362,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00] ; SKYLAKE-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [1:1.00] -; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pshufhw: @@ -5427,7 +5427,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00] ; SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [1:1.00] -; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pshuflw: @@ -5489,8 +5489,8 @@ ; SKYLAKE-LABEL: test_pslld: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpslld $2, %xmm0, 
%xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pslld: @@ -5604,8 +5604,8 @@ ; SKYLAKE-LABEL: test_psllq: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psllq: @@ -5669,8 +5669,8 @@ ; SKYLAKE-LABEL: test_psllw: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psllw: @@ -5734,8 +5734,8 @@ ; SKYLAKE-LABEL: test_psrad: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psrad: @@ -5799,8 +5799,8 @@ ; SKYLAKE-LABEL: test_psraw: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psraw: @@ -5864,8 +5864,8 @@ ; SKYLAKE-LABEL: test_psrld: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] 
-; SKYLAKE-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psrld: @@ -5979,8 +5979,8 @@ ; SKYLAKE-LABEL: test_psrlq: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psrlq: @@ -6044,8 +6044,8 @@ ; SKYLAKE-LABEL: test_psrlw: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [2:1.00] -; SKYLAKE-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00] +; SKYLAKE-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psrlw: @@ -6107,7 +6107,7 @@ ; ; SKYLAKE-LABEL: test_psubb: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -6165,7 +6165,7 @@ ; ; SKYLAKE-LABEL: test_psubd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -6219,7 +6219,7 @@ ; ; SKYLAKE-LABEL: test_psubq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; 
SKYLAKE-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -6513,7 +6513,7 @@ ; ; SKYLAKE-LABEL: test_psubw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -6634,7 +6634,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] ; SKYLAKE-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [1:1.00] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_punpckhdq: @@ -6697,7 +6697,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] ; SKYLAKE-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_punpckhqdq: @@ -6878,7 +6878,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] ; SKYLAKE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [1:1.00] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_punpckldq: @@ -6941,7 +6941,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] ; SKYLAKE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: 
test_punpcklqdq: @@ -7062,7 +7062,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [1:0.50] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pxor: @@ -7125,7 +7125,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00] ; SKYLAKE-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_shufpd: @@ -7187,9 +7187,9 @@ ; ; SKYLAKE-LABEL: test_sqrtpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:1.00] -; SKYLAKE-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [21:1.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [18:1.00] +; SKYLAKE-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [18:1.00] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtpd: @@ -7258,10 +7258,10 @@ ; ; SKYLAKE-LABEL: test_sqrtsd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00] +; SKYLAKE-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [18:1.00] ; SKYLAKE-NEXT: vmovapd (%rdi), %xmm1 # sched: [1:0.50] -; SKYLAKE-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [18:1.00] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtsd: @@ -7320,8 +7320,8 @@ ; ; SKYLAKE-LABEL: test_subpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vsubpd (%rdi), %xmm0, 
%xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subpd: @@ -7374,8 +7374,8 @@ ; ; SKYLAKE-LABEL: test_subsd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subsd: @@ -7555,7 +7555,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] ; SKYLAKE-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpckhpd: @@ -7624,7 +7624,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] ; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpcklpd: @@ -7685,9 +7685,9 @@ ; ; SKYLAKE-LABEL: test_xorpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00] -; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SKYLAKE-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_xorpd: Index: test/CodeGen/X86/sse3-schedule.ll 
=================================================================== --- test/CodeGen/X86/sse3-schedule.ll +++ test/CodeGen/X86/sse3-schedule.ll @@ -42,8 +42,8 @@ ; ; SKYLAKE-LABEL: test_addsubpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addsubpd: @@ -97,8 +97,8 @@ ; ; SKYLAKE-LABEL: test_addsubps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SKYLAKE-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addsubps: @@ -152,8 +152,8 @@ ; ; SKYLAKE-LABEL: test_haddpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00] -; SKYLAKE-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [5:2.00] +; SKYLAKE-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [6:2.00] +; SKYLAKE-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [6:2.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_haddpd: @@ -207,8 +207,8 @@ ; ; SKYLAKE-LABEL: test_haddps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00] -; SKYLAKE-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [5:2.00] +; SKYLAKE-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [6:2.00] +; SKYLAKE-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [6:2.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_haddps: @@ -262,8 +262,8 @@ ; ; SKYLAKE-LABEL: test_hsubpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00] -; SKYLAKE-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [5:2.00] +; SKYLAKE-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: 
[6:2.00] +; SKYLAKE-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [6:2.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_hsubpd: @@ -317,8 +317,8 @@ ; ; SKYLAKE-LABEL: test_hsubps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00] -; SKYLAKE-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [5:2.00] +; SKYLAKE-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [6:2.00] +; SKYLAKE-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [6:2.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_hsubps: @@ -488,7 +488,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00] ; SKYLAKE-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [1:0.50] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movddup: @@ -552,7 +552,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00] ; SKYLAKE-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [1:0.50] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movshdup: @@ -616,7 +616,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00] ; SKYLAKE-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [1:0.50] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movsldup: Index: test/CodeGen/X86/sse41-schedule.ll =================================================================== --- test/CodeGen/X86/sse41-schedule.ll +++ test/CodeGen/X86/sse41-schedule.ll @@ -40,7 +40,7 @@ ; SKYLAKE-LABEL: test_blendpd: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.33] -; 
SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -145,8 +145,8 @@ ; ; SKYLAKE-LABEL: test_blendvpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00] -; SKYLAKE-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00] +; SKYLAKE-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67] +; SKYLAKE-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:0.67] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendvpd: @@ -200,8 +200,8 @@ ; ; SKYLAKE-LABEL: test_blendvps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00] -; SKYLAKE-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00] +; SKYLAKE-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67] +; SKYLAKE-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:0.67] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendvps: @@ -298,8 +298,8 @@ ; ; SKYLAKE-LABEL: test_dpps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [14:2.00] -; SKYLAKE-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [14:2.00] +; SKYLAKE-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [13:1.33] +; SKYLAKE-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_dpps: @@ -436,8 +436,8 @@ ; ; SKYLAKE-LABEL: test_mpsadbw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [7:2.00] -; SKYLAKE-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [7:2.00] +; SKYLAKE-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [4:2.00] +; SKYLAKE-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [4:2.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mpsadbw: @@ -542,8 +542,8 @@ ; ; SKYLAKE-LABEL: test_pblendvb: ; SKYLAKE: # BB#0: -; 
SKYLAKE-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00] -; SKYLAKE-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00] +; SKYLAKE-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67] +; SKYLAKE-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:0.67] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pblendvb: @@ -592,7 +592,7 @@ ; SKYLAKE-LABEL: test_pblendw: ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00] -; SKYLAKE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00] +; SKYLAKE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pblendw: @@ -689,7 +689,7 @@ ; ; SKYLAKE-LABEL: test_pextrb: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpextrb $3, %xmm0, %eax # sched: [2:1.00] +; SKYLAKE-NEXT: vpextrb $3, %xmm0, %eax # sched: [3:1.00] ; SKYLAKE-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -738,7 +738,7 @@ ; ; SKYLAKE-LABEL: test_pextrd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpextrd $3, %xmm0, %eax # sched: [2:1.00] +; SKYLAKE-NEXT: vpextrd $3, %xmm0, %eax # sched: [3:1.00] ; SKYLAKE-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -786,7 +786,7 @@ ; ; SKYLAKE-LABEL: test_pextrq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpextrq $1, %xmm0, %rax # sched: [2:1.00] +; SKYLAKE-NEXT: vpextrq $1, %xmm0, %rax # sched: [3:1.00] ; SKYLAKE-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -834,7 +834,7 @@ ; ; SKYLAKE-LABEL: test_pextrw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpextrw $3, %xmm0, %eax # sched: [2:1.00] +; SKYLAKE-NEXT: vpextrw $3, %xmm0, %eax # sched: [3:1.00] ; SKYLAKE-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; @@ -883,8 +883,8 @@ ; ; 
SKYLAKE-LABEL: test_phminposuw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vphminposuw (%rdi), %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vphminposuw (%rdi), %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vphminposuw %xmm0, %xmm0 # sched: [4:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_phminposuw: @@ -1034,7 +1034,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:2.00] ; SKYLAKE-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [1:1.00] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pinsrq: @@ -1483,7 +1483,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00] ; SKYLAKE-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [1:1.00] -; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxbw: @@ -1541,7 +1541,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00] ; SKYLAKE-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [1:1.00] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxbd: @@ -1599,7 +1599,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00] ; SKYLAKE-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [1:1.00] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxbq: @@ -1657,7 +1657,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00] ; SKYLAKE-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [1:1.00] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: 
[1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxdq: @@ -1715,7 +1715,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00] ; SKYLAKE-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [1:1.00] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxwd: @@ -1773,7 +1773,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00] ; SKYLAKE-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [1:1.00] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxwq: @@ -1831,7 +1831,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] ; SKYLAKE-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [1:1.00] -; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxbw: @@ -1889,7 +1889,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00] ; SKYLAKE-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [1:1.00] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxbd: @@ -1947,7 +1947,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovzxbq 
{{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00] ; SKYLAKE-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxbq: @@ -2005,7 +2005,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00] ; SKYLAKE-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [1:1.00] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxdq: @@ -2063,7 +2063,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00] ; SKYLAKE-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [1:1.00] -; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxwd: @@ -2121,7 +2121,7 @@ ; SKYLAKE: # BB#0: ; SKYLAKE-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00] ; SKYLAKE-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [1:1.00] -; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxwq: @@ -2172,8 +2172,8 @@ ; ; SKYLAKE-LABEL: test_pmuldq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 
# sched: [4:0.33] +; SKYLAKE-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmuldq: @@ -2222,8 +2222,8 @@ ; ; SKYLAKE-LABEL: test_pmulld: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [10:2.00] -; SKYLAKE-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [10:2.00] +; SKYLAKE-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [8:0.67] +; SKYLAKE-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [8:0.67] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmulld: @@ -2286,9 +2286,9 @@ ; ; SKYLAKE-LABEL: test_ptest: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00] +; SKYLAKE-NEXT: vptest %xmm1, %xmm0 # sched: [3:1.00] ; SKYLAKE-NEXT: setb %al # sched: [1:0.50] -; SKYLAKE-NEXT: vptest (%rdi), %xmm0 # sched: [2:1.00] +; SKYLAKE-NEXT: vptest (%rdi), %xmm0 # sched: [3:1.00] ; SKYLAKE-NEXT: setb %cl # sched: [1:0.50] ; SKYLAKE-NEXT: andb %al, %cl # sched: [1:0.25] ; SKYLAKE-NEXT: movzbl %cl, %eax # sched: [1:0.25] @@ -2353,9 +2353,9 @@ ; ; SKYLAKE-LABEL: test_roundpd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [5:1.25] -; SKYLAKE-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [6:2.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [8:0.67] +; SKYLAKE-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [8:0.67] +; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundpd: @@ -2411,9 +2411,9 @@ ; ; SKYLAKE-LABEL: test_roundps: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [5:1.25] -; SKYLAKE-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [6:2.00] -; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [8:0.67] +; SKYLAKE-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [8:0.67] +; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] ; 
SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundps: @@ -2470,9 +2470,9 @@ ; ; SKYLAKE-LABEL: test_roundsd: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [5:1.25] -; SKYLAKE-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67] +; SKYLAKE-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:0.67] +; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundsd: @@ -2529,9 +2529,9 @@ ; ; SKYLAKE-LABEL: test_roundss: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [5:1.25] -; SKYLAKE-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; SKYLAKE-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67] +; SKYLAKE-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:0.67] +; SKYLAKE-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundss: Index: test/CodeGen/X86/sse42-schedule.ll =================================================================== --- test/CodeGen/X86/sse42-schedule.ll +++ test/CodeGen/X86/sse42-schedule.ll @@ -503,9 +503,9 @@ ; ; SKYLAKE-LABEL: test_pcmpistri: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00] +; SKYLAKE-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [10:3.00] ; SKYLAKE-NEXT: movl %ecx, %eax # sched: [1:0.25] -; SKYLAKE-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [11:3.00] +; SKYLAKE-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [10:3.00] ; SKYLAKE-NEXT: # kill: %ECX %ECX %RCX ; SKYLAKE-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] @@ -562,8 +562,8 @@ ; ; SKYLAKE-LABEL: test_pcmpistrm: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00] 
-; SKYLAKE-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:3.00] +; SKYLAKE-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [10:3.00] +; SKYLAKE-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [10:3.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpistrm: @@ -611,8 +611,8 @@ ; ; SKYLAKE-LABEL: test_pcmpgtq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKYLAKE-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [3:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpgtq: @@ -661,8 +661,8 @@ ; ; SKYLAKE-LABEL: test_pclmulqdq: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [11:2.00] -; SKYLAKE-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [11:2.00] +; SKYLAKE-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [6:1.00] +; SKYLAKE-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [6:1.00] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pclmulqdq: Index: test/CodeGen/X86/ssse3-schedule.ll =================================================================== --- test/CodeGen/X86/ssse3-schedule.ll +++ test/CodeGen/X86/ssse3-schedule.ll @@ -626,8 +626,8 @@ ; ; SKYLAKE-LABEL: test_pmaddubsw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmaddubsw: @@ -682,8 +682,8 @@ ; ; SKYLAKE-LABEL: test_pmulhrsw: ; SKYLAKE: # BB#0: -; SKYLAKE-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SKYLAKE-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SKYLAKE-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKYLAKE-NEXT: vpmulhrsw 
(%rdi), %xmm0, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmulhrsw: