Index: lib/Target/X86/X86SchedHaswell.td =================================================================== --- lib/Target/X86/X86SchedHaswell.td +++ lib/Target/X86/X86SchedHaswell.td @@ -23,8 +23,8 @@ // Based on the LSD (loop-stream detector) queue size and benchmarking data. let LoopMicroOpBufferSize = 50; - // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow - // the scheduler to assign a default model to unrecognized opcodes. + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. let CompleteModel = 0; } @@ -436,30 +436,6 @@ // r,m. def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>; -// CMOVcc. -// r,r. -def : InstRW<[Write2P0156_Lat2], - (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>; -// r,m. -def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], - (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>; - -// XCHG. -// r,r. -def WriteXCHG : SchedWriteRes<[HWPort0156]> { - let Latency = 2; - let ResourceCycles = [3]; -} - -def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>; - -// r,m. -def WriteXCHGrm : SchedWriteRes<[]> { - let Latency = 21; - let NumMicroOps = 8; -} -def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>; - // XLAT. def WriteXLAT : SchedWriteRes<[]> { let Latency = 7; @@ -471,12 +447,6 @@ // m. def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>; -// PUSHF. -def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> { - let NumMicroOps = 4; -} -def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>; - // PUSHA. def WritePushA : SchedWriteRes<[]> { let NumMicroOps = 19; @@ -487,178 +457,14 @@ // m. def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>; -// POPF. -def WritePopF : SchedWriteRes<[]> { - let NumMicroOps = 9; -} -def : InstRW<[WritePopF], (instregex "POPF(16|32)")>; - // POPA. 
def WritePopA : SchedWriteRes<[]> { let NumMicroOps = 18; } def : InstRW<[WritePopA], (instregex "POPA(16|32)")>; -// LAHF SAHF. -def : InstRW<[WriteP06], (instregex "(S|L)AHF")>; - -// BSWAP. -// r32. -def WriteBSwap32 : SchedWriteRes<[HWPort15]>; -def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>; - -// r64. -def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> { - let NumMicroOps = 2; -} -def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>; - -// MOVBE. -// r16,m16 / r64,m64. -def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>; - -// r32, m32. -def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> { - let NumMicroOps = 2; -} -def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>; - -// m16,r16. -def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>; - -// m32,r32. -def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>; - -// m64,r64. -def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> { - let NumMicroOps = 4; -} -def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>; - //-- Arithmetic instructions --// -// ADD SUB. -// m,r/i. -def : InstRW<[Write2P0156_2P237_P4], - (instregex "(ADD|SUB)(8|16|32|64)m(r|i)", - "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>; - -// ADC SBB. -// r,r/i. -def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)", - "(ADC|SBB)(16|32|64)ri8", - "(ADC|SBB)64ri32", - "(ADC|SBB)(8|16|32|64)rr_REV")>; - -// r,m. -def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>; - -// m,r/i. -def : InstRW<[Write3P0156_2P237_P4], - (instregex "(ADC|SBB)(8|16|32|64)m(r|i)", - "(ADC|SBB)(16|32|64)mi8", - "(ADC|SBB)64mi32")>; - -// INC DEC NOT NEG. -// m. 
-def : InstRW<[WriteP0156_2P237_P4], - (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m", - "(INC|DEC)64(16|32)m")>; - -// MUL IMUL. -// r16. -def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> { - let Latency = 4; - let NumMicroOps = 4; -} -def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>; - -// m16. -def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { - let Latency = 8; - let NumMicroOps = 5; -} -def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>; - -// r32. -def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> { - let Latency = 4; - let NumMicroOps = 3; -} -def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>; - -// m32. -def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { - let Latency = 8; - let NumMicroOps = 4; -} -def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>; - -// r64. -def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> { - let Latency = 3; - let NumMicroOps = 2; -} -def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>; - -// m64. -def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { - let Latency = 7; - let NumMicroOps = 3; -} -def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>; - -// r16,r16. -def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> { - let Latency = 4; - let NumMicroOps = 2; -} -def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>; - -// r16,m16. -def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { - let Latency = 8; - let NumMicroOps = 3; -} -def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>; - -// MULX. -// r32,r32,r32. -def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> { - let Latency = 4; - let NumMicroOps = 3; - let ResourceCycles = [1, 2]; -} -def : InstRW<[WriteMulX32], (instregex "MULX32rr")>; - -// r32,r32,m32. 
-def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> { - let Latency = 8; - let NumMicroOps = 4; - let ResourceCycles = [1, 2, 1]; -} -def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>; - -// r64,r64,r64. -def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> { - let Latency = 4; - let NumMicroOps = 2; -} -def : InstRW<[WriteMulX64], (instregex "MULX64rr")>; - -// r64,r64,m64. -def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { - let Latency = 8; - let NumMicroOps = 3; -} -def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>; - // DIV. // r8. def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { @@ -667,27 +473,6 @@ } def : InstRW<[WriteDiv8], (instregex "DIV8r")>; -// r16. -def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 23; - let NumMicroOps = 10; -} -def : InstRW<[WriteDiv16], (instregex "DIV16r")>; - -// r32. -def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 22; - let NumMicroOps = 10; -} -def : InstRW<[WriteDiv32], (instregex "DIV32r")>; - -// r64. -def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 32; - let NumMicroOps = 36; -} -def : InstRW<[WriteDiv64], (instregex "DIV64r")>; - // IDIV. // r8. def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { @@ -696,259 +481,23 @@ } def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>; -// r16. -def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 23; - let NumMicroOps = 10; -} -def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>; - -// r32. -def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 22; - let NumMicroOps = 9; -} -def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>; - -// r64. 
-def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 39; - let NumMicroOps = 59; -} -def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>; - -//-- Logic instructions --// - -// AND OR XOR. -// m,r/i. -def : InstRW<[Write2P0156_2P237_P4], - (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)", - "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>; - -// SHR SHL SAR. -// m,i. -def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; -} -def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>; - -// r,cl. -def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>; - -// m,cl. -def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> { - let NumMicroOps = 6; - let ResourceCycles = [3, 2, 1]; -} -def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>; - -// ROR ROL. -// r,1. -def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>; - -// m,i. -def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { - let NumMicroOps = 5; - let ResourceCycles = [2, 2, 1]; -} -def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>; - -// r,cl. -def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>; - -// m,cl. -def WriteRotateRMWCL : SchedWriteRes<[]> { - let NumMicroOps = 6; -} -def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>; - -// RCR RCL. -// r,1. -def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> { - let Latency = 2; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; -} -def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>; - -// m,1. -def WriteRCm1 : SchedWriteRes<[]> { - let NumMicroOps = 6; -} -def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>; - -// r,i. 
-def WriteRCri : SchedWriteRes<[HWPort0156]> { - let Latency = 6; - let NumMicroOps = 8; -} -def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>; - -// m,i. -def WriteRCmi : SchedWriteRes<[]> { - let NumMicroOps = 11; -} -def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>; - -// SHRD SHLD. -// r,r,i. -def WriteShDrr : SchedWriteRes<[HWPort1]> { - let Latency = 3; -} -def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>; - -// m,r,i. -def WriteShDmr : SchedWriteRes<[]> { - let NumMicroOps = 5; -} -def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>; - -// r,r,cl. -def WriteShlDCL : SchedWriteRes<[HWPort0156]> { - let Latency = 3; - let NumMicroOps = 4; -} -def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>; - -// r,r,cl. -def WriteShrDCL : SchedWriteRes<[HWPort0156]> { - let Latency = 4; - let NumMicroOps = 4; -} -def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>; - -// m,r,cl. -def WriteShDmrCL : SchedWriteRes<[]> { - let NumMicroOps = 7; -} -def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>; - // BT. -// r,r/i. -def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>; - // m,r. def WriteBTmr : SchedWriteRes<[]> { let NumMicroOps = 10; } def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>; -// m,i. -def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>; - // BTR BTS BTC. -// r,r,i. -def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>; - // m,r. def WriteBTRSCmr : SchedWriteRes<[]> { let NumMicroOps = 11; } def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>; -// m,i. -def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>; - -// BSF BSR. -// r,r. -def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>; -// r,m. -def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>; - -// SETcc. -// r. -def : InstRW<[WriteShift], - (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>; -// m. 
-def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteSetCCm], - (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>; - -// CLD STD. -def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>; - -// LZCNT TZCNT. -// r,r. -def : InstRW<[WriteP1_Lat3], (instregex "(L|TZCNT)(16|32|64)rr")>; -// r,m. -def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|TZCNT)(16|32|64)rm")>; - -// ANDN. -// r,r. -def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>; -// r,m. -def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>; - -// BLSI BLSMSK BLSR. -// r,r. -def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>; -// r,m. -def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>; - -// BEXTR. -// r,r,r. -def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>; -// r,m,r. -def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>; - -// BZHI. -// r,r,r. -def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>; -// r,m,r. -def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>; - -// PDEP PEXT. -// r,r,r. -def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>; -// r,m,r. -def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>; - //-- Control transfer instructions --// -// J(E|R)CXZ. -def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> { - let NumMicroOps = 2; -} -def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>; - -// LOOP. -def WriteLOOP : SchedWriteRes<[]> { - let NumMicroOps = 7; -} -def : InstRW<[WriteLOOP], (instregex "LOOP")>; - -// LOOP(N)E -def WriteLOOPE : SchedWriteRes<[]> { - let NumMicroOps = 11; -} -def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>; - // CALL. -// r. -def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>; - -// m. 
-def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; -} -def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>; - -// RET. -def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> { - let NumMicroOps = 2; -} -def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>; - // i. def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> { let NumMicroOps = 4; @@ -977,12 +526,6 @@ // LODSD/Q. def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>; -// STOS. -def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>; - // MOVS. def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> { let Latency = 4; @@ -1002,57 +545,9 @@ } def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>; -//-- Synchronization instructions --// - -// XADD. -def WriteXADD : SchedWriteRes<[]> { - let NumMicroOps = 5; -} -def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>; - -// CMPXCHG. -def WriteCMPXCHG : SchedWriteRes<[]> { - let NumMicroOps = 6; -} -def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>; - -// CMPXCHG8B. -def WriteCMPXCHG8B : SchedWriteRes<[]> { - let NumMicroOps = 15; -} -def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>; - -// CMPXCHG16B. -def WriteCMPXCHG16B : SchedWriteRes<[]> { - let NumMicroOps = 22; -} -def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>; - //-- Other --// -// PAUSE. -def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> { - let NumMicroOps = 5; - let ResourceCycles = [1, 3]; -} -def : InstRW<[WritePAUSE], (instregex "PAUSE")>; - -// LEAVE. -def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>; - -// XGETBV. -def WriteXGETBV : SchedWriteRes<[]> { - let NumMicroOps = 8; -} -def : InstRW<[WriteXGETBV], (instregex "XGETBV")>; - -// RDTSC. 
-def WriteRDTSC : SchedWriteRes<[]> { - let NumMicroOps = 15; -} -def : InstRW<[WriteRDTSC], (instregex "RDTSC")>; - -// RDPMC. +// RDPMC. def WriteRDPMC : SchedWriteRes<[]> { let NumMicroOps = 34; } @@ -1072,13 +567,6 @@ // m80. def : InstRW<[WriteP01], (instregex "LD_Frr")>; -def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> { - let Latency = 4; - let NumMicroOps = 4; - let ResourceCycles = [2, 2]; -} -def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>; - // FBLD. // m80. def WriteFBLD : SchedWriteRes<[]> { @@ -1091,86 +579,14 @@ // r. def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>; -// m80. -def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> { - let NumMicroOps = 7; - let ResourceCycles = [3, 2, 2]; -} -def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>; - -// FBSTP. -// m80. -def WriteFBSTP : SchedWriteRes<[]> { - let NumMicroOps = 226; -} -def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>; - -// FXCHG. -def : InstRW<[WriteNop], (instregex "XCH_F")>; - -// FILD. -def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> { - let Latency = 6; - let NumMicroOps = 2; -} -def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>; - -// FIST(P) FISTTP. -def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> { - let Latency = 7; - let NumMicroOps = 3; -} -def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>; - // FLDZ. def : InstRW<[WriteP01], (instregex "LD_F0")>; -// FLD1. -def : InstRW<[Write2P01], (instregex "LD_F1")>; - // FLDPI FLDL2E etc. def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>; -// FCMOVcc. -def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> { - let Latency = 2; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; -} -def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>; - -// FNSTSW. -// AX. -def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> { - let NumMicroOps = 2; -} -def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>; - -// m16. 
-def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> { - let Latency = 6; - let NumMicroOps = 3; -} -def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>; - -// FLDCW. -def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> { - let Latency = 7; - let NumMicroOps = 3; -} -def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>; - -// FNSTCW. -def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>; - -// FINCSTP FDECSTP. -def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>; - -// FFREE. -def : InstRW<[WriteP01], (instregex "FFREE")>; +// FFREE. +def : InstRW<[WriteP01], (instregex "FFREE")>; // FNSAVE. def WriteFNSAVE : SchedWriteRes<[]> { @@ -1192,13 +608,6 @@ // FCHS. def : InstRW<[WriteP0], (instregex "CHS_F")>; -// FCOM(P) FUCOM(P). -// r. -def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr", - "UCOM_FPr")>; -// m. -def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>; - // FCOMPP FUCOMPP. // r. def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>; @@ -1208,9 +617,6 @@ def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr", "UCOM_FIPr")>; -// FICOM(P). -def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>; - // FTST. def : InstRW<[WriteP1], (instregex "TST_F")>; @@ -1272,66 +678,6 @@ def : InstRW<[WriteFNINIT], (instregex "FNINIT")>; //=== Integer MMX and XMM Instructions ===// -//-- Move instructions --// - -// MOVD. -// r32/64 <- (x)mm. -def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr", - "VMOVPDI2DIrr", "MOVPDI2DIrr")>; - -// (x)mm <- r32/64. -def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr", - "VMOVDI2PDIrr", "MOVDI2PDIrr")>; - -// MOVQ. -// r64 <- (x)mm. -def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>; - -// (x)mm <- r64. -def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>; - -// (x)mm <- (x)mm. 
-def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>; - -// (V)MOVDQA/U. -// x <- x. -def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr", - "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV", - "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>; - -// MOVDQ2Q. -def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>; - -// MOVQ2DQ. -def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>; - - -// PACKSSWB/DW. -// mm <- mm. -def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> { - let Latency = 2; - let NumMicroOps = 3; - let ResourceCycles = [3]; -} -def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr", - "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>; - -// mm <- m64. -def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> { - let Latency = 4; - let NumMicroOps = 3; - let ResourceCycles = [1, 3]; -} -def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm", - "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>; - -// VPMOVSX/ZX BW BD BQ DW DQ. -// y <- x. -def WriteVPMOVSX : SchedWriteRes<[HWPort5]> { - let Latency = 3; - let NumMicroOps = 1; -} -def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>; // PBLENDW. // x,x,i / v,v,v,i @@ -1346,94 +692,12 @@ } def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>; -// VPBLENDD. -// v,v,v,i. -def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>; -def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>; - -// v,v,m,i -def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> { - let NumMicroOps = 2; - let Latency = 4; - let ResourceCycles = [1, 1]; -} -def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>; - -// MASKMOVQ. -def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> { - let Latency = 13; - let NumMicroOps = 4; - let ResourceCycles = [1, 1, 2]; -} -def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>; - -// MASKMOVDQU. 
-def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> { - let Latency = 14; - let NumMicroOps = 10; - let ResourceCycles = [4, 2, 4]; -} -def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>; - -// VPMASKMOV D/Q. -// v,v,m. -def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> { - let Latency = 4; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; -} -def : InstRW<[WriteVPMASKMOVr, ReadAfterLd], - (instregex "VPMASKMOV(D|Q)(Y?)rm")>; - -// m, v,v. -def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { - let Latency = 13; - let NumMicroOps = 4; - let ResourceCycles = [1, 1, 1, 1]; -} -def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>; - // PMOVMSKB. def WritePMOVMSKB : SchedWriteRes<[HWPort0]> { let Latency = 3; } def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>; -// PEXTR B/W/D/Q. -// r32,x,i. -def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; -} -def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>; - -// m8,x,i. -def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> { - let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; -} -def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>; - -// VPBROADCAST B/W. -// x, m8/16. -def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { - let Latency = 5; - let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; -} -def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd], - (instregex "VPBROADCAST(B|W)rm")>; - -// y, m8/16 -def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { - let Latency = 7; - let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; -} -def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd], - (instregex "VPBROADCAST(B|W)Yrm")>; - // VPGATHERDD. // x. 
def WriteVPGATHERDD128 : SchedWriteRes<[]> { @@ -1521,198 +785,7 @@ let ResourceCycles = [1, 2, 1]; } -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> { - let Latency = 3; - let NumMicroOps = 3; - let ResourceCycles = [1, 2]; -} -def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64", - "MMX_PHADDSWrr64", - "MMX_PHSUB(W|D)rr64", - "MMX_PHSUBSWrr64", - "(V?)PH(ADD|SUB)(W|D)(Y?)rr", - "(V?)PH(ADD|SUB)SWrr(256)?")>; - -// v <- v,m. -def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { - let Latency = 6; - let NumMicroOps = 3; - let ResourceCycles = [1, 2, 1]; -} -def : InstRW<[WritePHADDSUBm, ReadAfterLd], - (instregex "MMX_PHADD(W?)rm64", - "MMX_PHADDSWrm64", - "MMX_PHSUB(W|D)rm64", - "MMX_PHSUBSWrm64", - "(V?)PH(ADD|SUB)(W|D)(Y?)rm", - "(V?)PH(ADD|SUB)SWrm(128|256)?")>; - -// PCMPGTQ. -// v <- v,v. -def WritePCMPGTQr : SchedWriteRes<[HWPort0]> { - let Latency = 5; - let NumMicroOps = 1; -} -def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; - -// v <- v,m. -def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> { - let Latency = 5; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; -} -def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>; - -// PMULLD. -// x,x / y,y,y. -def WritePMULLDr : SchedWriteRes<[HWPort0]> { - let Latency = 10; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>; - -// x,m / y,y,m. -def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> { - let Latency = 10; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; -} -def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>; - -//-- Logic instructions --// - -// PTEST. -// v,v. -def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; -} -def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>; - -// v,m. 
-def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> { - let Latency = 6; - let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; -} -def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rm")>; - -// PSLL,PSRL,PSRA W/D/Q. -// x,x / v,v,x. -def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; -} -def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>; - -// PSLL,PSRL DQ. -def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>; - -//-- Other --// - -// EMMS. -def WriteEMMS : SchedWriteRes<[]> { - let Latency = 13; - let NumMicroOps = 31; -} -def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>; - //=== Floating Point XMM and YMM Instructions ===// -//-- Move instructions --// - -// MOVMSKP S/D. -// r32 <- x. -def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> { - let Latency = 3; -} -def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>; - -// r32 <- y. -def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> { - let Latency = 2; -} -def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>; - -// VPERM2F128. -def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>; -def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>; - -// BLENDVP S/D. -def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>; -def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>; - -// VBROADCASTF128. -def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>; - -// EXTRACTPS. -// r32,x,i. -def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> { - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; -} -def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>; - -// m32,x,i. -def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> { - let Latency = 4; - let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; -} -def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>; - -// VEXTRACTF128. -// x,y,i. 
-def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>; - -// m128,y,i. -def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> { - let Latency = 4; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; -} -def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>; - -// VINSERTF128. -// y,y,x,i. -def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>; - -// y,y,m128,i. -def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> { - let Latency = 4; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; -} -def : InstRW<[WriteFShuffle256, ReadAfterLd], (instregex "VINSERTF128rm")>; - -// VMASKMOVP S/D. -// v,v,m. -def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> { - let Latency = 4; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; -} -def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>; - -// m128,x,x. -def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { - let Latency = 13; - let NumMicroOps = 4; - let ResourceCycles = [1, 1, 1, 1]; -} -def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>; - -// m256,y,y. -def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { - let Latency = 14; - let NumMicroOps = 4; - let ResourceCycles = [1, 1, 1, 1]; -} -def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>; // VGATHERDPS. // x. @@ -1766,415 +839,3613 @@ } def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>; -//-- Conversion instructions --// +// Remaining instrs. -// CVTPD2PS. -// x,x. -def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>; - -// x,m128. 
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>;
+def HWWriteResGroup0 : SchedWriteRes<[HWPort23]> { // 1 micro-op, 1 cycle on HWPort23, latency 1; bound below to rm/m-form opcodes.
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup0], (instregex "LDDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "LD_F32m")>;
+def: InstRW<[HWWriteResGroup0], (instregex "LD_F64m")>;
+def: InstRW<[HWWriteResGroup0], (instregex "LD_F80m")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVD64from64rm")>; // NOTE(review): the same pattern is also bound to HWWriteResGroup1 further down; confirm which group is intended.
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVD64rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVD64to64rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MMX_MOVQ64rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOV(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOV64toPQIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOV8rm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVAPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVAPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDI2PDIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVNTDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSHDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSLDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSX(16|32|64)rm16")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSX(16|32|64)rm32")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSX(16|32|64)rm8")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVUPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVUPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVZX(16|32|64)rm16")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVZX(16|32|64)rm8")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHNTA")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHT0")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHT1")>;
+def: InstRW<[HWWriteResGroup0], (instregex "PREFETCHT2")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTF128")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTI128")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VLDDQUYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VLDDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOV64toPQIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPSYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDDUPYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDI2PDIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQAYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQUYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVNTDQAYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVNTDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVQI2PQIrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSHDUPYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSHDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSLDUPYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSLDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPSYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTDYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTQYrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTQrm")>;

-// x,y.
-def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> {
-  let Latency = 5;
+def HWWriteResGroup1 : SchedWriteRes<[HWPort4,HWPort237]> { // 2 micro-ops, 1 cycle each on HWPort4 and HWPort237, latency 1; bound below to mr/m-form opcodes.
+  let Latency = 1;
   let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+  let ResourceCycles = [1,1];
 }
-def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "FBSTPm")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64from64rm")>; // NOTE(review): the same pattern is also bound to HWWriteResGroup0 above; confirm which group is intended.
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVNTQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVQ64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV8mi")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV8mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVAPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVAPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVDQAmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVDQUmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVHPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVHPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVLPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVLPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTDQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTI_64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPDI2DImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPQI2QImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPQIto64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVSSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVUPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVUPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "ST_FP32m")>;
+def: InstRW<[HWWriteResGroup1], (instregex "ST_FP64m")>;
+def: InstRW<[HWWriteResGroup1], (instregex "ST_FP80m")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTF128mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTI128mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPDI2DImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQI2QImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQIto64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVSDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVSSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMPTRSTm")>;

-// x,m256.
-def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
-  let Latency = 9;
-  let NumMicroOps = 3;
-  let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup2 : SchedWriteRes<[HWPort0]> { // 1 micro-op, 1 cycle on HWPort0, latency 1.
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64grr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PMOVMSKBrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MOVPDI2DIrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MOVPQIto64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VMOVPDI2DIrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VMOVPQIto64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRADYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSrr")>;

-// CVTSD2SS.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
-
-// x,m64.
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
-
-// CVTPS2PD.
-// x,x.
-def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> {
-  let Latency = 2;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup3 : SchedWriteRes<[HWPort1]> { // 1 micro-op, 1 cycle on HWPort1, latency 1.
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>;
+def: InstRW<[HWWriteResGroup3], (instregex "COMP_FST0r")>;
+def: InstRW<[HWWriteResGroup3], (instregex "COM_FST0r")>;
+def: InstRW<[HWWriteResGroup3], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[HWWriteResGroup3], (instregex "MMX_MASKMOVQ64")>; // NOTE(review): exact duplicate of the previous binding; confirm whether a different opcode was intended.
+def: InstRW<[HWWriteResGroup3], (instregex "UCOM_FPr")>;
+def: InstRW<[HWWriteResGroup3], (instregex "UCOM_Fr")>;
+def: InstRW<[HWWriteResGroup3], (instregex "VMASKMOVDQU")>;

-// x,m64.
-// y,m128.
-def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> {
-  let Latency = 5;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup4 : SchedWriteRes<[HWPort5]> { // 1 micro-op, 1 cycle on HWPort5, latency 1.
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDNPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDNPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "INSERTPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64rr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64to64rr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVQ2DQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PALIGNR64irr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFBrr64")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHBWirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHDQirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHWDirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLBWirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLDQirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLWDirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOV64toPQIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVAPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVAPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVDDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVDI2PDIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVHLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVLHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSDrr(_REV)?")>; // "(_REV)?" keeps the whole suffix optional so the plain rr opcode matches as well as rr_REV.
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSHDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSLDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVUPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVUPSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKSSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKSSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKUSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKUSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PALIGNRrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PBLENDWrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFHWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFLWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSLLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSRLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "SHUFPDrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "SHUFPSrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VBROADCASTSSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VINSERTPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVDI2PDIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVHLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVLHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "XORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "XORPSrr")>;

-// y,x.
-def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> {
-  let Latency = 5;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup5 : SchedWriteRes<[HWPort6]> { // 1 micro-op, 1 cycle on HWPort6, latency 1.
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[HWWriteResGroup5], (instregex "JMP(16|32|64)r")>;

-// CVTSS2SD.
-// x,x.
-def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> {
-  let Latency = 2;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup6 : SchedWriteRes<[HWPort01]> { // 1 micro-op, 1 cycle on HWPort01, latency 1.
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+def: InstRW<[HWWriteResGroup6], (instregex "FINCSTP")>;
+def: InstRW<[HWWriteResGroup6], (instregex "FNOP")>;

-// x,m32.
-def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> {
-  let Latency = 5;
-  let NumMicroOps = 2;
-  let ResourceCycles = [1, 1];
+def HWWriteResGroup7 : SchedWriteRes<[HWPort06]> { // 1 micro-op, 1 cycle on HWPort06, latency 1; bound below to BT*, Jcc/JMP, SETcc and shift/rotate opcodes.
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
 }
-def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "CDQ")>;
+def: InstRW<[HWWriteResGroup7], (instregex "CQO")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JAE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JAE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JA_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JA_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JBE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JBE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JB_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JB_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JGE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JGE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JG_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JG_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JLE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JLE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JL_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JL_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JMP_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JMP_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNO_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNO_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNP_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNP_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNS_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNS_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JO_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JO_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JP_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JP_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JS_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JS_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "RORX32ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "RORX64ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SARX32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SARX64rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETAEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETBr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETGEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETGr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETLEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETLr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNOr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNPr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNSr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETOr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETPr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETSr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHLX32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHLX64rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHRX32rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHRX64rr")>;

-// CVTDQ2PD.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>;
-
-// y,x.
-def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>;
-
-// CVT(T)PD2DQ.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>;
-// x,m128.
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>;
-// x,y.
-def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>;
-// x,m256.
-def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>;
-
-// CVT(T)PS2PI.
-// mm,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>;
-
-// CVTPI2PD.
-// x,mm.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>;
-
-// CVT(T)PD2PI.
-// mm,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>;
-
-// CVSTSI2SS.
-// x,r32.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
-
-// CVT(T)SS2SI.
-// r32,x.
-def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
-// r32,m32.
-def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
-
-// CVTSI2SD.
-// x,r32/64.
-def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SS(64)?rr")>; - -// CVTSD2SI. -// r32/64 -def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>; -// r32,m32. -def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>; - -// VCVTPS2PH. -// x,v,i. -def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>; -// m,v,i. -def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>; - -// VCVTPH2PS. -// v,x. -def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>; - -//-- Arithmetic instructions --// - -// HADD, HSUB PS/PD -// x,x / v,v,v. -def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> { - let Latency = 5; - let NumMicroOps = 3; - let ResourceCycles = [1, 2]; +def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; } -def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "ANDN32rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "ANDN64rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BLSI32rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BLSI64rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BLSMSK32rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BLSMSK64rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BLSR32rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BLSR64rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BZHI32rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BZHI64rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "LEA(16|32|64)r")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSBrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSDrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSWrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDDirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDQirr")>; +def: 
InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQDirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTDirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXUBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINUBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNBrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNDrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNWrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBDirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBQirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PABSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PABSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PABSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDBrr")>; +def: 
InstRW<[HWWriteResGroup8], (instregex "PADDDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDQrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDUSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDUSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PAVGBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PAVGWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQQrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXUBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXUDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXUWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINUBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINUDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINUWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSIGNBrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSIGNDrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSIGNWrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBQrr")>; +def: 
InstRW<[HWWriteResGroup8], (instregex "PSUBSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDQYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDQrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDYrr")>; 
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDrr")>; +def: 
InstRW<[HWWriteResGroup8], (instregex "VPMINUWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBYrr256")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDYrr256")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWYrr256")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWrr")>; -// x,m / v,v,m. 
-def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1, 2, 1]; +def HWWriteResGroup9 : SchedWriteRes<[HWPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; } -def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>; +def: InstRW<[HWWriteResGroup9], (instregex "BLENDPDrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "BLENDPSrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVD64from64rr")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVQ64rr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDNirr")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDirr")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_PORirr")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_PXORirr")>; +def: InstRW<[HWWriteResGroup9], (instregex "MOVDQArr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "MOVPQI2QIrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "PANDNrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "PANDrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "PORrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "PXORrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDYrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSYrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "VMOVPQI2QIrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPANDNYrr")>; +def: InstRW<[HWWriteResGroup9], 
(instregex "VPANDNrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPANDYrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPANDrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDYrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPORYrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPORrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPXORYrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPXORrr")>; -// MULL SS/SD PS/PD. -// x,x / v,v,v. -def WriteMULr : SchedWriteRes<[HWPort01]> { - let Latency = 5; +def HWWriteResGroup10 : SchedWriteRes<[HWPort0156]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; } -def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>; +def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)ri8")>; +def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "ADD8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "ADD8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "ADD8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)ri8")>; +def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "AND8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "AND8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "AND8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "CBW")>; +def: InstRW<[HWWriteResGroup10], (instregex "CLC")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMC")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)ri8")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMP8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMP8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMP8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "CWDE")>; +def: InstRW<[HWWriteResGroup10], (instregex 
"DEC(16|32|64)r")>; +def: InstRW<[HWWriteResGroup10], (instregex "DEC8r")>; +def: InstRW<[HWWriteResGroup10], (instregex "INC(16|32|64)r")>; +def: InstRW<[HWWriteResGroup10], (instregex "INC8r")>; +def: InstRW<[HWWriteResGroup10], (instregex "LAHF")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOV8ri(_alt)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOV8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>; +def: InstRW<[HWWriteResGroup10], (instregex "NEG(16|32|64)r")>; +def: InstRW<[HWWriteResGroup10], (instregex "NEG8r")>; +def: InstRW<[HWWriteResGroup10], (instregex "NOOP")>; +def: InstRW<[HWWriteResGroup10], (instregex "NOT(16|32|64)r")>; +def: InstRW<[HWWriteResGroup10], (instregex "NOT8r")>; +def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)ri8")>; +def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "OR8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "OR8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "OR8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "SAHF")>; +def: InstRW<[HWWriteResGroup10], (instregex "SGDT64m")>; +def: InstRW<[HWWriteResGroup10], (instregex "SIDT64m")>; +def: InstRW<[HWWriteResGroup10], (instregex "SLDT64m")>; +def: InstRW<[HWWriteResGroup10], (instregex "SMSW16m")>; +def: InstRW<[HWWriteResGroup10], (instregex "STC")>; +def: InstRW<[HWWriteResGroup10], (instregex "STRm")>; +def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)ri8")>; +def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex 
"SUB8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "SUB8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "SUB8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "SYSCALL")>; +def: InstRW<[HWWriteResGroup10], (instregex "TEST(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup10], (instregex "TEST8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "TEST8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "TEST8rr")>; +def: InstRW<[HWWriteResGroup10], (instregex "XCHG(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)ri8")>; +def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup10], (instregex "XOR8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "XOR8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "XOR8rr")>; -// x,m / v,v,m. -def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> { - let Latency = 9; +def HWWriteResGroup11 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 1; let NumMicroOps = 2; - let ResourceCycles = [1, 1]; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup11], (instregex "CVTPS2PDrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "CVTSS2SDrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLDrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLQrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLWrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRADrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRAWrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLDrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLQrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLWrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VCVTPH2PSYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VCVTPH2PSrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VCVTPS2PDrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VCVTSS2SDrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VPSLLDYrm")>; +def: 
InstRW<[HWWriteResGroup11], (instregex "VPSLLQYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VPSLLVQYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VPSLLVQrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VPSLLWYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VPSRADYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VPSRAWYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VPSRLDYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VPSRLQYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VPSRLVQYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VPSRLVQrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VPSRLWYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VTESTPDYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VTESTPDrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VTESTPSYrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VTESTPSrm")>; + +def HWWriteResGroup12 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup12], (instregex "FCOM32m")>; +def: InstRW<[HWWriteResGroup12], (instregex "FCOM64m")>; +def: InstRW<[HWWriteResGroup12], (instregex "FCOMP32m")>; +def: InstRW<[HWWriteResGroup12], (instregex "FCOMP64m")>; + +def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup13], (instregex "ANDNPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "ANDNPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "ANDPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "ANDPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "INSERTPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "MMX_PALIGNR64irm")>; +def: InstRW<[HWWriteResGroup13], (instregex "MMX_PINSRWirmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "MMX_PSHUFBrm64")>; +def: InstRW<[HWWriteResGroup13], (instregex "MMX_PSHUFWmi")>; +def: InstRW<[HWWriteResGroup13], 
(instregex "MMX_PUNPCKHBWirm")>; +def: InstRW<[HWWriteResGroup13], (instregex "MMX_PUNPCKHDQirm")>; +def: InstRW<[HWWriteResGroup13], (instregex "MMX_PUNPCKHWDirm")>; +def: InstRW<[HWWriteResGroup13], (instregex "MMX_PUNPCKLBWirm")>; +def: InstRW<[HWWriteResGroup13], (instregex "MMX_PUNPCKLDQirm")>; +def: InstRW<[HWWriteResGroup13], (instregex "MMX_PUNPCKLWDirm")>; +def: InstRW<[HWWriteResGroup13], (instregex "MOVHPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "MOVHPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "MOVLPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "MOVLPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "ORPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "ORPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PACKSSDWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PACKSSWBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PACKUSDWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PACKUSWBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PALIGNRrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "PBLENDWrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "PINSRBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PINSRDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PINSRQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PINSRWrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXBDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXBQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXWDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PMOVSXWQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PMOVZXBDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PMOVZXBQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PMOVZXBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PMOVZXDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PMOVZXWDrm")>; +def: 
InstRW<[HWWriteResGroup13], (instregex "PMOVZXWQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PSHUFBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PSHUFDmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "PSHUFHWmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "PSHUFLWmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHQDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHWDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLQDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLWDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "SHUFPDrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "SHUFPSrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDNPDYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDNPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDNPSYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDNPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDPDYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDPSYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VINSERTPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VMOVHPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VMOVHPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VMOVLPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VMOVLPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VORPDYrm")>; 
+def: InstRW<[HWWriteResGroup13], (instregex "VORPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VORPSYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VORPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSDWYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSDWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSWBYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSWBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSDWYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSDWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSWBYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSWBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPALIGNRYrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPALIGNRrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPBLENDWYrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPBLENDWrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDYmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSYmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPINSRBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPINSRDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPINSRQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPINSRWrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPMOVSXBDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPMOVSXBQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPMOVSXBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPMOVSXDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPMOVSXWDrm")>; +def: 
InstRW<[HWWriteResGroup13], (instregex "VPMOVSXWQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXBDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXBQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXWDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPMOVZXWQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFBYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFDYmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFDmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFHWYmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFHWmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFLWYmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFLWmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHBWYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHDQYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHQDQYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHQDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHWDYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHWDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLBWYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLDQYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLQDQYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLQDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLWDYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLWDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPDYrmi")>; 
+def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPDrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPSYrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPSrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPDYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPSYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPDYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPSYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VXORPDYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VXORPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VXORPSYrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VXORPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "XORPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "XORPSrm")>; + +def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup14], (instregex "FARJMP64")>; +def: InstRW<[HWWriteResGroup14], (instregex "JMP(16|32|64)m")>; + +def HWWriteResGroup15 : SchedWriteRes<[HWPort23,HWPort06]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup15], (instregex "BT(16|32|64)mi8")>; +def: InstRW<[HWWriteResGroup15], (instregex "RORX32mi")>; +def: InstRW<[HWWriteResGroup15], (instregex "RORX64mi")>; +def: InstRW<[HWWriteResGroup15], (instregex "SARX32rm")>; +def: InstRW<[HWWriteResGroup15], (instregex "SARX64rm")>; +def: InstRW<[HWWriteResGroup15], (instregex "SHLX32rm")>; +def: InstRW<[HWWriteResGroup15], (instregex "SHLX64rm")>; +def: InstRW<[HWWriteResGroup15], (instregex "SHRX32rm")>; +def: InstRW<[HWWriteResGroup15], (instregex "SHRX64rm")>; + +def 
HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort15]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup16], (instregex "ANDN32rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "ANDN64rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BLSI32rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BLSI64rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BLSMSK32rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BLSMSK64rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BLSR32rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BLSR64rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BZHI32rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BZHI64rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSBrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSDrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSWrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDDirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDQirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQDirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTDirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex 
"MMX_PMAXSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMAXUBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINUBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNBrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNDrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNWrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBDirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBQirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MOVBE(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PABSBrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PABSDrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PABSWrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PADDBrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PADDDrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PADDQrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PADDSBrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PADDSWrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PADDUSBrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PADDUSWrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PADDWrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PAVGBrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PAVGWrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PCMPEQBrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PCMPEQDrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PCMPEQQrm")>; +def: InstRW<[HWWriteResGroup16], (instregex "PCMPEQWrm")>; +def: 
InstRW<[HWWriteResGroup16], (instregex "PCMPGTBrm")>;
+// The rest of the HWWriteResGroup16 opcode patterns, gathered into a single
+// instregex list. Each pattern string is kept exactly as written.
+def: InstRW<[HWWriteResGroup16],
+    (instregex "PCMPGTDrm", "PCMPGTWrm",
+               "PMAXSBrm", "PMAXSDrm", "PMAXSWrm",
+               "PMAXUBrm", "PMAXUDrm", "PMAXUWrm",
+               "PMINSBrm", "PMINSDrm", "PMINSWrm",
+               "PMINUBrm", "PMINUDrm", "PMINUWrm",
+               "PSIGNBrm128", "PSIGNDrm128", "PSIGNWrm128",
+               "PSUBBrm", "PSUBDrm", "PSUBQrm",
+               "PSUBSBrm", "PSUBSWrm", "PSUBUSBrm", "PSUBUSWrm", "PSUBWrm",
+               "VPABSBYrm", "VPABSBrm", "VPABSDYrm", "VPABSDrm",
+               "VPABSWYrm", "VPABSWrm",
+               "VPADDBYrm", "VPADDBrm", "VPADDDYrm", "VPADDDrm",
+               "VPADDQYrm", "VPADDQrm",
+               "VPADDSBYrm", "VPADDSBrm", "VPADDSWYrm", "VPADDSWrm",
+               "VPADDUSBYrm", "VPADDUSBrm", "VPADDUSWYrm", "VPADDUSWrm",
+               "VPADDWYrm", "VPADDWrm",
+               "VPAVGBYrm", "VPAVGBrm", "VPAVGWYrm", "VPAVGWrm",
+               "VPCMPEQBYrm", "VPCMPEQBrm", "VPCMPEQDYrm", "VPCMPEQDrm",
+               "VPCMPEQQYrm", "VPCMPEQQrm", "VPCMPEQWYrm", "VPCMPEQWrm",
+               "VPCMPGTBYrm", "VPCMPGTBrm", "VPCMPGTDYrm", "VPCMPGTDrm",
+               "VPCMPGTWYrm", "VPCMPGTWrm",
+               "VPMAXSBYrm", "VPMAXSBrm", "VPMAXSDYrm", "VPMAXSDrm",
+               "VPMAXSWYrm", "VPMAXSWrm",
+               "VPMAXUBYrm", "VPMAXUBrm", "VPMAXUDYrm", "VPMAXUDrm",
+               "VPMAXUWYrm", "VPMAXUWrm",
+               "VPMINSBYrm", "VPMINSBrm", "VPMINSDYrm", "VPMINSDrm",
+               "VPMINSWYrm", "VPMINSWrm",
+               "VPMINUBYrm", "VPMINUBrm", "VPMINUDYrm", "VPMINUDrm",
+               "VPMINUWYrm", "VPMINUWrm",
+               "VPSIGNBYrm256", "VPSIGNBrm128", "VPSIGNDYrm256",
+               "VPSIGNDrm128", "VPSIGNWYrm256", "VPSIGNWrm128",
+               "VPSUBBYrm", "VPSUBBrm", "VPSUBDYrm", "VPSUBDrm",
+               "VPSUBQYrm", "VPSUBQrm",
+               "VPSUBSBYrm", "VPSUBSBrm", "VPSUBSWYrm", "VPSUBSWrm",
+               "VPSUBUSBYrm", "VPSUBUSBrm", "VPSUBUSWYrm", "VPSUBUSWrm",
+               "VPSUBWYrm", "VPSUBWrm")>;
+
+// HWWriteResGroup17: latency 1, 2 micro-ops, with ResourceCycles [1,1] over
+// the two listed resources HWPort23 and HWPort015.
+def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort015]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup17],
+    (instregex "BLENDPDrmi", "BLENDPSrmi",
+               "MMX_PANDNirm", "MMX_PANDirm", "MMX_PORirm", "MMX_PXORirm",
+               "PANDNrm", "PANDrm", "PORrm", "PXORrm",
+               "VBLENDPDYrmi", "VBLENDPDrmi", "VBLENDPSYrmi", "VBLENDPSrmi",
+               "VINSERTF128rm", "VINSERTI128rm",
+               "VPANDNYrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex
"VPANDNrm")>;
+// Remaining HWWriteResGroup17 patterns, listed in one instregex.
+def: InstRW<[HWWriteResGroup17],
+    (instregex "VPANDYrm", "VPANDrm", "VPBLENDDYrmi", "VPBLENDDrmi",
+               "VPORYrm", "VPORrm", "VPXORYrm", "VPXORrm")>;
+
+// HWWriteResGroup18: latency 1, 2 micro-ops, with ResourceCycles [1,1] over
+// the two listed resources HWPort23 and HWPort0156.
+def HWWriteResGroup18 : SchedWriteRes<[HWPort23,HWPort0156]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+// In the POP pattern the optional group is "(mr)?": the whole "mr" suffix may
+// be absent, so the plain POP<size>r name is covered as well as POP<size>rmr.
+// With the '?' inside the group, "(mr?)", only the final 'r' is optional and
+// the plain register name can never match.
+def: InstRW<[HWWriteResGroup18],
+    (instregex "ADD(16|32|64)rm", "ADD8rm",
+               "AND(16|32|64)rm", "AND8rm",
+               "CMP(16|32|64)mi8", "CMP(16|32|64)mr", "CMP(16|32|64)rm",
+               "CMP8mi", "CMP8mr", "CMP8rm",
+               "OR(16|32|64)rm", "OR8rm",
+               "POP(16|32|64)r(mr)?",
+               "SUB(16|32|64)rm", "SUB8rm",
+               "TEST(16|32|64)rm", "TEST8mi", "TEST8rm",
+               "XOR(16|32|64)rm", "XOR8rm")>;
+
+// HWWriteResGroup19: latency 1, 2 micro-ops over HWPort237 and HWPort0156.
+def HWWriteResGroup19 : SchedWriteRes<[HWPort237,HWPort0156]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles =
[1,1];
+}
+def: InstRW<[HWWriteResGroup19], (instregex "SFENCE")>;
+
+// HWWriteResGroup20: latency 1, 3 micro-ops, with ResourceCycles [1,1,1] over
+// HWPort4, HWPort5 and HWPort237.
+def HWWriteResGroup20 : SchedWriteRes<[HWPort4,HWPort5,HWPort237]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup20],
+    (instregex "EXTRACTPSmr",
+               "PEXTRBmr", "PEXTRDmr", "PEXTRQmr", "PEXTRWmr",
+               "STMXCSR",
+               "VEXTRACTPSmr",
+               "VPEXTRBmr", "VPEXTRDmr", "VPEXTRQmr", "VPEXTRWmr",
+               "VSTMXCSR")>;
+
+// HWWriteResGroup21: latency 1, 3 micro-ops, with ResourceCycles [1,1,1] over
+// HWPort4, HWPort6 and HWPort237.
+def HWWriteResGroup21 : SchedWriteRes<[HWPort4,HWPort6,HWPort237]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup21], (instregex "FNSTCW16m")>;
+
+// HWWriteResGroup22: latency 1, 3 micro-ops, with ResourceCycles [1,1,1] over
+// HWPort4, HWPort237 and HWPort06.
+def HWWriteResGroup22 : SchedWriteRes<[HWPort4,HWPort237,HWPort06]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup22],
+    (instregex "SETAEm", "SETBm", "SETEm", "SETGEm", "SETGm", "SETLEm",
+               "SETLm", "SETNEm", "SETNOm", "SETNPm", "SETNSm", "SETOm")>;
+def:
InstRW<[HWWriteResGroup22], (instregex "SETPm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETSm")>;
+
+// HWWriteResGroup23: latency 1, 3 micro-ops, with ResourceCycles [1,1,1] over
+// HWPort4, HWPort237 and HWPort15.
+def HWWriteResGroup23 : SchedWriteRes<[HWPort4,HWPort237,HWPort15]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup23], (instregex "MOVBE(32|64)mr")>;
+
+// HWWriteResGroup23_16: same latency and micro-op count as HWWriteResGroup23,
+// but it lists HWPort06 where that group lists HWPort15.
+def HWWriteResGroup23_16 : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup23_16], (instregex "MOVBE16mr")>;
+
+// HWWriteResGroup24: latency 1, 3 micro-ops, with ResourceCycles [1,1,1] over
+// HWPort4, HWPort237 and HWPort0156.
+def HWWriteResGroup24 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+// In the PUSH pattern the optional group is "(mr)?": the whole "mr" suffix
+// may be absent, so the plain PUSH<size>r name is covered as well as
+// PUSH<size>rmr. With "(mr?)" only the final 'r' is optional and the plain
+// register name can never match.
+def: InstRW<[HWWriteResGroup24],
+    (instregex "PUSH(16|32|64)r(mr)?", "PUSH64i8",
+               "STOSB", "STOSL", "STOSQ", "STOSW")>;
+
+// HWWriteResGroup25: latency 1, 4 micro-ops, with ResourceCycles [1,1,1,1]
+// over HWPort4, HWPort23, HWPort237 and HWPort06.
+def HWWriteResGroup25 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
+  let Latency = 1;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup25],
+    (instregex "BTC(16|32|64)mi8", "BTR(16|32|64)mi8", "BTS(16|32|64)mi8",
+               "SAR(16|32|64)m1", "SAR(16|32|64)mi", "SAR8m1", "SAR8mi",
+               "SHL(16|32|64)m1", "SHL(16|32|64)mi", "SHL8m1", "SHL8mi",
+               "SHR(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup25],
(instregex "SHR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHR8m1", "SHR8mi")>;
+
+// HWWriteResGroup26: latency 1, 4 micro-ops, with ResourceCycles [1,1,1,1]
+// over HWPort4, HWPort23, HWPort237 and HWPort0156.
+def HWWriteResGroup26 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+  let Latency = 1;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup26],
+    (instregex "ADD(16|32|64)mi8", "ADD(16|32|64)mr", "ADD8mi", "ADD8mr",
+               "AND(16|32|64)mi8", "AND(16|32|64)mr", "AND8mi", "AND8mr",
+               "DEC(16|32|64)m", "DEC8m",
+               "INC(16|32|64)m", "INC8m",
+               "NEG(16|32|64)m", "NEG8m",
+               "NOT(16|32|64)m", "NOT8m",
+               "OR(16|32|64)mi8", "OR(16|32|64)mr", "OR8mi", "OR8mr",
+               "SUB(16|32|64)mi8", "SUB(16|32|64)mr", "SUB8mi", "SUB8mr",
+               "XOR(16|32|64)mi8", "XOR(16|32|64)mr", "XOR8mi", "XOR8mr")>;
+
+// HWWriteResGroup27: latency 2, 2 micro-ops, with HWPort5 held for 2 cycles.
+def HWWriteResGroup27 : SchedWriteRes<[HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup27],
+    (instregex "BLENDVPDrr0", "BLENDVPSrr0",
+               "MMX_PINSRWirri",
+               "PBLENDVBrr0",
+               "PINSRBrr", "PINSRDrr", "PINSRQrr", "PINSRWrri",
+               "VBLENDVPDYrr", "VBLENDVPDrr", "VBLENDVPSYrr", "VBLENDVPSrr",
+               "VPBLENDVBYrr", "VPBLENDVBrr",
+               "VPINSRBrr", "VPINSRDrr", "VPINSRQrr", "VPINSRWrri")>;
+
+// HWWriteResGroup28: latency 2, 2 micro-ops, with HWPort01 held for 2 cycles.
+def HWWriteResGroup28 : SchedWriteRes<[HWPort01]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup28], (instregex "FDECSTP")>;
+
+// HWWriteResGroup29: latency 2, 2 micro-ops, with HWPort06 held for 2 cycles.
+def HWWriteResGroup29 : SchedWriteRes<[HWPort06]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup29],
+    (instregex "ROL(16|32|64)r1", "ROL(16|32|64)ri", "ROL8r1", "ROL8ri",
+               "ROR(16|32|64)r1", "ROR(16|32|64)ri", "ROR8r1", "ROR8ri")>;
+
+// HWWriteResGroup30: latency 2, 2 micro-ops, with HWPort0156 held for 2 cycles.
+def HWWriteResGroup30 : SchedWriteRes<[HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup30],
+    (instregex "LFENCE", "MFENCE", "WAIT", "XGETBV")>;
+
+// HWWriteResGroup31: latency 2, 2 micro-ops, with ResourceCycles [1,1] over
+// HWPort0 and HWPort5.
+def HWWriteResGroup31 : SchedWriteRes<[HWPort0,HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup31],
+    (instregex "CVTPS2PDrr", "CVTSS2SDrr",
+               "EXTRACTPSrr",
+               "MMX_PEXTRWirri",
+               "PEXTRBrr", "PEXTRDrr", "PEXTRQrr", "PEXTRWri", "PEXTRWrr_REV",
+               "PSLLDrr", "PSLLQrr", "PSLLWrr",
+               "PSRADrr", "PSRAWrr",
+               "PSRLDrr", "PSRLQrr", "PSRLWrr",
+               "PTESTrr",
+               "VCVTPH2PSYrr", "VCVTPH2PSrr", "VCVTPS2PDrr", "VCVTSS2SDrr",
+               "VEXTRACTPSrr")>;
+def:
InstRW<[HWWriteResGroup31], (instregex "VPEXTRBrr")>;
+// Remaining HWWriteResGroup31 patterns, listed in one instregex.
+def: InstRW<[HWWriteResGroup31],
+    (instregex "VPEXTRDrr", "VPEXTRQrr", "VPEXTRWri", "VPEXTRWrr_REV",
+               "VPSLLDrr", "VPSLLQrr", "VPSLLWrr",
+               "VPSRADrr", "VPSRAWrr",
+               "VPSRLDrr", "VPSRLQrr", "VPSRLWrr",
+               "VPTESTrr")>;
+
+// HWWriteResGroup32: latency 2, 2 micro-ops, with ResourceCycles [1,1] over
+// HWPort6 and HWPort0156.
+def HWWriteResGroup32 : SchedWriteRes<[HWPort6,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup32], (instregex "CLFLUSH")>;
+
+// HWWriteResGroup33: latency 2, 2 micro-ops, with ResourceCycles [1,1] over
+// HWPort01 and HWPort015.
+def HWWriteResGroup33 : SchedWriteRes<[HWPort01,HWPort015]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup33], (instregex "MMX_MOVDQ2Qrr")>;
+
+// HWWriteResGroup34: latency 2, 2 micro-ops, with ResourceCycles [1,1] over
+// HWPort06 and HWPort15.
+def HWWriteResGroup34 : SchedWriteRes<[HWPort06,HWPort15]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup34],
+    (instregex "BEXTR32rr", "BEXTR64rr", "BSWAP(16|32|64)r")>;
+
+// HWWriteResGroup35: latency 2, 2 micro-ops, with ResourceCycles [1,1] over
+// HWPort06 and HWPort0156.
+def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+// In the ADC/SBB register-register patterns the optional group is "(_REV)?":
+// the whole "_REV" suffix may be absent, so the plain ...rr name is covered
+// as well as ...rr_REV. With "(_REV?)" only the 'V' is optional and "_RE" is
+// always required, so the plain ...rr name can never match.
+def: InstRW<[HWWriteResGroup35],
+    (instregex "ADC(16|32|64)ri8", "ADC(16|32|64)rr(_REV)?",
+               "ADC8i8", "ADC8ri", "ADC8rr(_REV)?",
+               "CMOVAE(16|32|64)rr", "CMOVB(16|32|64)rr",
+               "CMOVE(16|32|64)rr", "CMOVG(16|32|64)rr",
+               "CMOVGE(16|32|64)rr", "CMOVL(16|32|64)rr",
+               "CMOVLE(16|32|64)rr", "CMOVNE(16|32|64)rr",
+               "CMOVNO(16|32|64)rr", "CMOVNP(16|32|64)rr",
+               "CMOVNS(16|32|64)rr", "CMOVO(16|32|64)rr",
+               "CMOVP(16|32|64)rr", "CMOVS(16|32|64)rr",
+               "CWD", "JRCXZ",
+               "SBB(16|32|64)ri8", "SBB(16|32|64)rr(_REV)?",
+               "SBB8i8", "SBB8ri", "SBB8rr(_REV)?",
+               "SETAr", "SETBEr")>;
+
+// HWWriteResGroup36: latency 2, 3 micro-ops; ResourceCycles [2,1] holds
+// HWPort5 for 2 cycles and HWPort23 for 1.
+def HWWriteResGroup36 : SchedWriteRes<[HWPort5,HWPort23]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup36],
+    (instregex "BLENDVPDrm0", "BLENDVPSrm0",
+               "MMX_PACKSSDWirm", "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>;
+def:
InstRW<[HWWriteResGroup36], (instregex "PBLENDVBrm0")>;
+// Remaining HWWriteResGroup36 patterns, listed in one instregex.
+def: InstRW<[HWWriteResGroup36],
+    (instregex "VBLENDVPDYrm", "VBLENDVPDrm", "VBLENDVPSYrm", "VBLENDVPSrm",
+               "VMASKMOVPDYrm", "VMASKMOVPDrm",
+               "VMASKMOVPSYrm", "VMASKMOVPSrm",
+               "VPBLENDVBYrm", "VPBLENDVBrm",
+               "VPMASKMOVDYrm", "VPMASKMOVDrm",
+               "VPMASKMOVQYrm", "VPMASKMOVQrm")>;
+
+// HWWriteResGroup37: latency 2, 3 micro-ops; ResourceCycles [1,2] holds
+// HWPort23 for 1 cycle and HWPort0156 for 2.
+def HWWriteResGroup37 : SchedWriteRes<[HWPort23,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup37],
+    (instregex "LEAVE64", "SCASB", "SCASL", "SCASQ", "SCASW")>;
+
+// HWWriteResGroup38: latency 2, 3 micro-ops, with ResourceCycles [1,1,1] over
+// HWPort0, HWPort5 and HWPort23.
+def HWWriteResGroup38 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup38],
+    (instregex "PSLLDrm", "PSLLQrm", "PSLLWrm",
+               "PSRADrm", "PSRAWrm",
+               "PSRLDrm", "PSRLQrm", "PSRLWrm",
+               "PTESTrm",
+               "VPSLLDrm", "VPSLLQrm", "VPSLLWrm",
+               "VPSRADrm", "VPSRAWrm",
+               "VPSRLDrm", "VPSRLQrm", "VPSRLWrm",
+               "VPTESTrm")>;
+
+// HWWriteResGroup39: latency 2, 3 micro-ops, with ResourceCycles [1,1,1] over
+// HWPort0, HWPort01 and HWPort23.
+def HWWriteResGroup39 : SchedWriteRes<[HWPort0,HWPort01,HWPort23]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup39], (instregex "FLDCW16m")>;
+
+// HWWriteResGroup40: latency 2, 3 micro-ops, with ResourceCycles [1,1,1] over
+// HWPort0, HWPort23 and HWPort0156.
+def HWWriteResGroup40 : SchedWriteRes<[HWPort0,HWPort23,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup40], (instregex "LDMXCSR", "VLDMXCSR")>;
+
+// HWWriteResGroup41: latency 2, 3 micro-ops, with ResourceCycles [1,1,1] over
+// HWPort6, HWPort23 and HWPort0156.
+def HWWriteResGroup41 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup41], (instregex "LRETQ", "RETQ")>;
+
+// HWWriteResGroup42: latency 2, 3 micro-ops, with ResourceCycles [1,1,1] over
+// HWPort23, HWPort06 and HWPort15.
+def HWWriteResGroup42 : SchedWriteRes<[HWPort23,HWPort06,HWPort15]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup42], (instregex "BEXTR32rm", "BEXTR64rm")>;
+
+// HWWriteResGroup43: latency 2, 3 micro-ops, with ResourceCycles [1,1,1] over
+// HWPort23, HWPort06 and HWPort0156.
+def HWWriteResGroup43 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup43],
+    (instregex "ADC(16|32|64)rm", "ADC8rm",
+               "CMOVAE(16|32|64)rm", "CMOVB(16|32|64)rm",
+               "CMOVE(16|32|64)rm", "CMOVG(16|32|64)rm",
+               "CMOVGE(16|32|64)rm", "CMOVL(16|32|64)rm",
+               "CMOVLE(16|32|64)rm", "CMOVNE(16|32|64)rm",
+               "CMOVNO(16|32|64)rm", "CMOVNP(16|32|64)rm",
+               "CMOVNS(16|32|64)rm", "CMOVO(16|32|64)rm",
+               "CMOVP(16|32|64)rm", "CMOVS(16|32|64)rm",
+               "SBB(16|32|64)rm", "SBB8rm")>;
+
+// HWWriteResGroup44: latency 2, 4 micro-ops, with ResourceCycles [1,1,1,1]
+// over HWPort4, HWPort6, HWPort237 and HWPort0156.
+def HWWriteResGroup44 : SchedWriteRes<[HWPort4,HWPort6,HWPort237,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup44], (instregex "CALL(16|32|64)r")>;
+
+// HWWriteResGroup45: latency 2, 4 micro-ops, with ResourceCycles [1,1,1,1]
+// over HWPort4, HWPort237, HWPort06 and HWPort0156.
+def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup45],
+    (instregex "CALL64pcrel32", "SETAm", "SETBEm")>;
+
+// HWWriteResGroup46: latency 2, 5 micro-ops; ResourceCycles [1,1,1,2] holds
+// the last listed resource (HWPort06) for 2 cycles.
+def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
+  let Latency = 2;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup46],
+    (instregex "ROL(16|32|64)m1", "ROL(16|32|64)mi", "ROL8m1", "ROL8mi",
+               "ROR(16|32|64)m1", "ROR(16|32|64)mi", "ROR8m1", "ROR8mi")>;
+
+// HWWriteResGroup47: latency 2, 5 micro-ops; ResourceCycles [1,1,1,2] holds
+// the last listed resource (HWPort0156) for 2 cycles.
+def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup47], (instregex "XADD(16|32|64)rm", "XADD8rm")>;
+
+// HWWriteResGroup48: latency 2, 5 micro-ops, with ResourceCycles [1,1,1,1,1]
+// over HWPort4, HWPort6, HWPort23, HWPort237 and HWPort0156.
+def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m", "FARCALL64")>;
+
+// HWWriteResGroup49: latency 3, 1 micro-op on HWPort0.
+def HWWriteResGroup49 : SchedWriteRes<[HWPort0]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup49],
+    (instregex "MOVMSKPDrr", "MOVMSKPSrr", "PMOVMSKBrr",
+               "VMOVMSKPDYrr", "VMOVMSKPDrr",
+               "VMOVMSKPSYrr", "VMOVMSKPSrr",
+               "VPMOVMSKBYrr", "VPMOVMSKBrr")>;
+
+// HWWriteResGroup50: latency 3, 1 micro-op on HWPort1.
+def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup50],
+    (instregex "ADDPDrr", "ADDPSrr", "ADDSDrr", "ADDSSrr",
+               "ADDSUBPDrr", "ADDSUBPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex
"ADD_FPrST0")>; +def: InstRW<[HWWriteResGroup50], (instregex "ADD_FST0r")>; +def: InstRW<[HWWriteResGroup50], (instregex "ADD_FrST0")>; +def: InstRW<[HWWriteResGroup50], (instregex "BSF(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "BSR(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "CMPPDrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "CMPPSrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "CMPSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "COMISDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "COMISSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "CVTDQ2PSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "CVTPS2DQrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "CVTTPS2DQrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "IMUL64rr(i8?)")>; +def: InstRW<[HWWriteResGroup50], (instregex "IMUL8r")>; +def: InstRW<[HWWriteResGroup50], (instregex "LZCNT(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MAXPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MAXPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MAXSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MAXSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MINPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MINPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MINSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MINSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MUL8r")>; +def: InstRW<[HWWriteResGroup50], (instregex "PDEP32rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "PDEP64rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "PEXT32rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "PEXT64rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "POPCNT(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "SHLD(16|32|64)rri8")>; +def: InstRW<[HWWriteResGroup50], (instregex "SHRD(16|32|64)rri8")>; +def: InstRW<[HWWriteResGroup50], 
(instregex "SUBPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FPrST0")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FST0r")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FrST0")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUB_FPrST0")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUB_FST0r")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUB_FrST0")>; +def: InstRW<[HWWriteResGroup50], (instregex "TZCNT(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "UCOMISDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "UCOMISSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDPDYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDPSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDYrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSYrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCOMISDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCOMISSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSrr")>; +def: InstRW<[HWWriteResGroup50], 
(instregex "VCVTPS2DQYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAXPDYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAXPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAXPSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAXPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAXSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAXSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMINPDYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMINPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMINPSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMINPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMINSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMINSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISSrr")>; + +def HWWriteResGroup50_16 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; +} +def: InstRW<[HWWriteResGroup50_16], (instregex "IMUL16rr(i8?)")>; + +def HWWriteResGroup50_32 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; +} +def: InstRW<[HWWriteResGroup50_32], (instregex "IMUL32rr(i8?)")>; + +def HWWriteResGroup51 : SchedWriteRes<[HWPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup51], (instregex "VBROADCASTSDYrr")>; +def: InstRW<[HWWriteResGroup51], 
(instregex "VBROADCASTSSYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTF128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTI128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VINSERTF128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VINSERTI128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERM2F128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERM2I128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMPDYri")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMPSYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMQYri")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBWYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXDQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBWYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXDQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWQYrr")>;
+
+def HWWriteResGroup52 : SchedWriteRes<[HWPort1,HWPort23]> { // Latency 3; 2 uops; ResourceCycles: HWPort1=1, HWPort23=1.
+  let Latency = 3;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup52], (instregex "ADDPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADD_F32m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADD_F64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "BSF(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "BSR(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CMPPDrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CMPPSrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CMPSSrm")>; // NOTE(review): no CMPSDrm entry sits next to CMPSSrm, while VCMPSDrm and VCMPSSrm both appear later in this group; confirm it is modelled elsewhere.
+def: InstRW<[HWWriteResGroup52], (instregex "COMISDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "COMISSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CVTDQ2PSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CVTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CVTTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ILD_F16m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ILD_F32m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ILD_F64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "IMUL64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "IMUL64rm(i8?)")>; // NOTE(review): in "(i8?)" only the "8" is optional, so the "i" is required as written; confirm "(i8)?" was not intended.
+def: InstRW<[HWWriteResGroup52], (instregex "IMUL8m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "LZCNT(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MAXPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MAXPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MAXSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MAXSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MINPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MINPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MINSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MINSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MMX_CVTPI2PSirm")>;
+def: 
InstRW<[HWWriteResGroup52], (instregex "MMX_CVTPS2PIirm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MMX_CVTTPS2PIirm")>; // Continuation of the HWWriteResGroup52 mapping list; every entry below reuses the same write resource.
+def: InstRW<[HWWriteResGroup52], (instregex "MUL64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MUL8m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "PDEP32rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "PDEP64rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "PEXT32rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "PEXT64rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "POPCNT(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBR_F32m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBR_F64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUB_F32m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUB_F64m")>;
+def: InstRW<[HWWriteResGroup52], (instregex "TZCNT(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "UCOMISDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "UCOMISSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDPDYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDPSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPDYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPPDYrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPPDrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPPSYrmi")>;
+def: 
InstRW<[HWWriteResGroup52], (instregex "VCMPPSrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCOMISDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCOMISSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTDQ2PSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTDQ2PSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTPS2DQYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTTPS2DQYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXPDYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXPSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAXSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINPDYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINPSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMINSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBPDYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBPSYrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBSDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBSSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VUCOMISDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VUCOMISSrm")>;
+
+def HWWriteResGroup52_16 : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { // Latency 3; 4 uops; no ResourceCycles override for the three listed resources.
+  let Latency = 3;
+  let NumMicroOps = 4;
+}
+def: 
InstRW<[HWWriteResGroup52_16], (instregex "IMUL16m")>;
+def: InstRW<[HWWriteResGroup52_16], (instregex "IMUL16rm(i8?)")>; // NOTE(review): in "(i8?)" only the "8" is optional, so the "i" is required as written; confirm "(i8)?" was not intended.
+def: InstRW<[HWWriteResGroup52_16], (instregex "MUL16m")>;
+
+def HWWriteResGroup52_32 : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { // Latency 3; 3 uops; no ResourceCycles override for the three listed resources.
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+def: InstRW<[HWWriteResGroup52_32], (instregex "IMUL32m")>;
+def: InstRW<[HWWriteResGroup52_32], (instregex "IMUL32rm(i8?)")>; // NOTE(review): same "(i8?)" grouping as the 16-bit pattern; the "i" is mandatory as written.
+def: InstRW<[HWWriteResGroup52_32], (instregex "MUL32m")>;
+
+def HWWriteResGroup53 : SchedWriteRes<[HWPort5,HWPort23]> { // Latency 3; 2 uops; ResourceCycles: HWPort5=1, HWPort23=1.
+  let Latency = 3;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup53], (instregex "VPERM2F128rm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERM2I128rm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMPDYmi")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMPSYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMQYmi")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXBDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXBQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXBWYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXDQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXWDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVSXWQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBWYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXDQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXWDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXWQYrm")>;
+
+def HWWriteResGroup54 : SchedWriteRes<[HWPort0156]> { // Latency 3; 3 uops; ResourceCycles: HWPort0156=3.
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [3];
+}
+def: InstRW<[HWWriteResGroup54], (instregex "XADD(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "XADD8rr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "XCHG8rr")>;
+
+def HWWriteResGroup55 : SchedWriteRes<[HWPort0,HWPort5]> { // Latency 3; 3 uops; ResourceCycles: HWPort0=2, HWPort5=1.
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDYrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDYrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDYrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDrr")>;
+
+def HWWriteResGroup56 : SchedWriteRes<[HWPort5,HWPort15]> { // Latency 3; 3 uops; ResourceCycles: HWPort5=2, HWPort15=1.
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDSWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBDrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBSWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHADDDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHADDSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHADDWrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHSUBDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHSUBSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHSUBWrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr256")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr256")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWrr")>;
+
+def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> { // Latency 3; 3 uops; ResourceCycles: HWPort5=2, HWPort0156=1.
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSDWirr")>;
+def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSWBirr")>;
+def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKUSWBirr")>;
+
+def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> { // Latency 3; 3 uops; ResourceCycles: HWPort6=1, HWPort0156=2.
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup58], (instregex "CLD")>;
+
+def HWWriteResGroup59 : SchedWriteRes<[HWPort06,HWPort0156]> { // Latency 3; 3 uops; ResourceCycles: HWPort06=1, HWPort0156=2.
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup59], (instregex "CMOVA(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup59], (instregex "CMOVBE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL8r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL8ri")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR8r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR8ri")>;
+
+def HWWriteResGroup60 : SchedWriteRes<[HWPort06,HWPort0156]> { // Same resources as HWWriteResGroup59 with the cycle split reversed: HWPort06=2, HWPort0156=1.
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup60], (instregex "ROL(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "ROL8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "ROR(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "ROR8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SAR(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], 
(instregex "SAR8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHL(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHL8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHR(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHR8rCL")>;
+
+def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> { // Latency 3; 3 uops; one cycle on each of HWPort0, HWPort4, HWPort237.
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup61], (instregex "FNSTSWm")>;
+
+def HWWriteResGroup62 : SchedWriteRes<[HWPort1,HWPort4,HWPort237]> { // Latency 3; 3 uops; one cycle on each of HWPort1, HWPort4, HWPort237.
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP16m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP32m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP64m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_F16m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_F32m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_FP16m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_FP32m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_FP64m")>;
+
+def HWWriteResGroup63 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { // Latency 3; 4 uops; ResourceCycles: HWPort0=2, HWPort5=1, HWPort23=1.
+  let Latency = 3;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2,1,1];
+}
+def: InstRW<[HWWriteResGroup63], (instregex "VPSLLVDYrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSLLVDrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSRAVDYrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSRAVDrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSRLVDYrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSRLVDrm")>;
+
+def HWWriteResGroup64 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> { // Latency 3; 4 uops; ResourceCycles: HWPort5=2, HWPort23=1, HWPort15=1.
+  let Latency = 3;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2,1,1];
+}
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDSWrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDWrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDrm64")>;
+def: InstRW<[HWWriteResGroup64], 
(instregex "MMX_PHSUBDrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBSWrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBWrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHADDDrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHADDSWrm128")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHADDWrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHSUBDrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHSUBSWrm128")>;
+def: InstRW<[HWWriteResGroup64], (instregex "PHSUBWrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDDYrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDDrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDSWrm128")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDSWrm256")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDWYrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHADDWrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBDYrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBDrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBSWrm128")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBSWrm256")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBWYrm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "VPHSUBWrm")>;
+
+def HWWriteResGroup65 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> { // Latency 3; 4 uops; ResourceCycles: HWPort23=1, HWPort06=1, HWPort0156=2.
+  let Latency = 3;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,2];
+}
+def: InstRW<[HWWriteResGroup65], (instregex "CMOVA(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CMOVBE(16|32|64)rm")>;
+
+def HWWriteResGroup66 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> { // Latency 3; 5 uops; ResourceCycles: HWPort23=1, HWPort237=1, HWPort06=1, HWPort0156=2.
+  let Latency = 3;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCL8m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCL8mi")>;
+def: InstRW<[HWWriteResGroup66], 
(instregex "RCR(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCR8m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCR8mi")>;
+
+def HWWriteResGroup67 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> { // Latency 3; 5 uops; ResourceCycles: HWPort23=1, HWPort237=1, HWPort06=2, HWPort0156=1.
+  let Latency = 3;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,2,1];
+}
+def: InstRW<[HWWriteResGroup67], (instregex "ROR(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup67], (instregex "ROR8mCL")>;
+
+def HWWriteResGroup68 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> { // Latency 3; 6 uops; ResourceCycles: HWPort4=1, HWPort23=1, HWPort237=1, HWPort0156=3.
+  let Latency = 3;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[HWWriteResGroup68], (instregex "ADC(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup68], (instregex "ADC8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "ADD8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "AND8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "OR8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "SUB8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "XCHG(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup68], (instregex "XCHG8rm")>;
+def: InstRW<[HWWriteResGroup68], (instregex "XOR8mi")>;
+
+def HWWriteResGroup69 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort0156]> { // Latency 3; 6 uops; ResourceCycles: HWPort4=1, HWPort23=1, HWPort237=1, HWPort06=2, HWPort0156=1.
+  let Latency = 3;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,1,1,2,1];
+}
+def: InstRW<[HWWriteResGroup69], (instregex "ADC(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "ADC8mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG8rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "ROL(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "ROL8mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SAR(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SAR8mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB8mi")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB8mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHL(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHL8mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHR(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHR8mCL")>;
+
+def HWWriteResGroup70 : SchedWriteRes<[HWPort0,HWPort1]> { // Latency 4; 2 uops; ResourceCycles: HWPort0=1, HWPort1=1.
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SIrr")>;
+
+def HWWriteResGroup71 : SchedWriteRes<[HWPort0,HWPort5]> { // Latency 4; 2 uops; ResourceCycles: HWPort0=1, HWPort5=1.
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup71], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSLLDYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSLLQYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSLLWYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRADYrr")>;
+def: 
InstRW<[HWWriteResGroup71], (instregex "VPSRAWYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRLDYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRLQYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRLWYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPTESTYrr")>;
+
+def HWWriteResGroup72 : SchedWriteRes<[HWPort0,HWPort0156]> { // Latency 4; 2 uops; ResourceCycles: HWPort0=1, HWPort0156=1.
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup72], (instregex "FNSTSW16r")>;
+
+def HWWriteResGroup73 : SchedWriteRes<[HWPort1,HWPort5]> { // Latency 4; 2 uops; ResourceCycles: HWPort1=1, HWPort5=1.
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup73], (instregex "CVTDQ2PDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2PSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSD2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SD64rr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPD2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPI2PDirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPS2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPD2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPS2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTDQ2PDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2PSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTPS2PHrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSD2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SD64rr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTTPD2DQrr")>;
+
+def HWWriteResGroup74 : SchedWriteRes<[HWPort1,HWPort6]> { // Latency 4; 2 uops; ResourceCycles: HWPort1=1, HWPort6=1.
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup74], (instregex "IMUL64r")>;
+def: InstRW<[HWWriteResGroup74], (instregex "MUL64r")>;
+def: InstRW<[HWWriteResGroup74], (instregex "MULX64rr")>;
+
+def HWWriteResGroup74_16 : SchedWriteRes<[HWPort1, HWPort0156]> { // Latency 4; 4 uops; no ResourceCycles override for the two listed resources.
+  let Latency = 4;
+  let NumMicroOps = 4;
+}
+def: InstRW<[HWWriteResGroup74_16], (instregex "IMUL16r")>;
+def: InstRW<[HWWriteResGroup74_16], (instregex "MUL16r")>;
+
+def HWWriteResGroup74_32 : SchedWriteRes<[HWPort1,HWPort0156]> { // Latency 4; 3 uops; no ResourceCycles override for the two listed resources.
+  let Latency = 4;
+  let NumMicroOps = 3;
+}
+def: InstRW<[HWWriteResGroup74_32], (instregex "IMUL32r")>;
+def: InstRW<[HWWriteResGroup74_32], (instregex "MUL32r")>;
+
+def HWWriteResGroup75 : SchedWriteRes<[HWPort1,HWPort23]> { // Latency 4; 3 uops; ResourceCycles: HWPort1=2, HWPort23=1.
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup75], (instregex "FICOM16m")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOM32m")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOMP16m")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOMP32m")>;
+
+def HWWriteResGroup76 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { // Latency 4; 3 uops; one cycle on each of HWPort0, HWPort1, HWPort23.
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTTSS2SIrm")>; // NOTE(review): CVTTSS2SI64rm is not listed here although CVTTSS2SI64rr is in HWWriteResGroup70; confirm it is modelled elsewhere.
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SIrm")>;
+
+def HWWriteResGroup77 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { // Latency 4; 3 uops; one cycle on each of HWPort0, HWPort5, HWPort23.
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup77], (instregex "VCVTPS2PDYrm")>;
+def: InstRW<[HWWriteResGroup77], (instregex "VPTESTYrm")>;
+
+def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { // Latency 4; 3 uops; one cycle on each of HWPort1, HWPort5, HWPort23.
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup78], (instregex "CVTDQ2PDrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2DQrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2PSrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTSD2SSrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTTPD2DQrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTPD2PIirm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTPI2PDirm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTTPD2PIirm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "VCVTDQ2PDrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "VCVTSD2SSrm")>;
+
+def HWWriteResGroup79 : SchedWriteRes<[HWPort1,HWPort6,HWPort23]> { // Latency 4; 3 uops; one cycle on each of HWPort1, HWPort6, HWPort23.
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup79], (instregex "MULX64rm")>;
+
+def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> { // Latency 4; 3 uops; one cycle on each of HWPort5, HWPort23, HWPort015.
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBYrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWYrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWrm")>;
+
+def HWWriteResGroup81 : SchedWriteRes<[HWPort0156]> { // Latency 4; 4 uops; ResourceCycles: HWPort0156=4.
+  let Latency = 4;
+  let NumMicroOps = 4;
+  let ResourceCycles = [4];
+}
+def: InstRW<[HWWriteResGroup81], (instregex "FNCLEX")>;
+
+def HWWriteResGroup82 : SchedWriteRes<[HWPort015,HWPort0156]> { // Latency 4; 4 uops; ResourceCycles: HWPort015=1, HWPort0156=3.
+  let Latency = 4;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,3];
+}
+def: InstRW<[HWWriteResGroup82], (instregex "VZEROUPPER")>;
+
+def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> { // Latency 4; 4 uops; ResourceCycles: HWPort1=1, HWPort6=1, HWPort0156=2.
+  let Latency = 4;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,2];
+}
+def: InstRW<[HWWriteResGroup83], (instregex "LAR(16|32|64)rr")>; // NOTE(review): no LSL(16|32|64)rr entry here, although the LAR and LSL rm forms share HWWriteResGroup87; confirm it is modelled elsewhere.
+
+def HWWriteResGroup84 : SchedWriteRes<[HWPort0,HWPort4,HWPort237,HWPort15]> { // Latency 4; 4 uops; one cycle on each of the four listed resources.
+  let Latency = 4;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQmr")>;
+
+def HWWriteResGroup85 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> { // Latency 4; 4 uops; one cycle on each of the four listed resources.
+  let Latency = 4;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup85], (instregex "VCVTPS2PHmr")>;
+
+def HWWriteResGroup86 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort0156]> { // Latency 4; 4 uops; one cycle on each of the four listed resources.
+  let Latency = 4;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup86], (instregex "SHLD(16|32|64)mri8")>;
+def: InstRW<[HWWriteResGroup86], (instregex "SHRD(16|32|64)mri8")>;
+
+def HWWriteResGroup87 : SchedWriteRes<[HWPort1,HWPort6,HWPort23,HWPort0156]> { // Latency 4; 5 uops; ResourceCycles: HWPort1=1, HWPort6=2, HWPort23=1, HWPort0156=1.
+  let Latency = 4;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[HWWriteResGroup87], (instregex "LAR(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup87], 
(instregex "LSL(16|32|64)rm")>; + +def HWWriteResGroup88 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> { + let Latency = 4; + let NumMicroOps = 6; + let ResourceCycles = [1,1,4]; +} +def: InstRW<[HWWriteResGroup88], (instregex "PUSHF16")>; +def: InstRW<[HWWriteResGroup88], (instregex "PUSHF64")>; + +def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDUBSWrr64")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDWDirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHRSWrr64")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHUWirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHWirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULLWirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULUDQirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PSADBWirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MUL_FPrST0")>; +def: InstRW<[HWWriteResGroup89], (instregex "MUL_FST0r")>; +def: InstRW<[HWWriteResGroup89], (instregex "MUL_FrST0")>; +def: InstRW<[HWWriteResGroup89], (instregex "PCMPGTQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PHMINPOSUWrr128")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMADDUBSWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMADDWDrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULDQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULHRSWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULHUWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULHWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULLWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULUDQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PSADBWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "RCPPSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "RCPSSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "RSQRTPSr")>; +def: 
InstRW<[HWWriteResGroup89], (instregex "RSQRTSSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPHMINPOSUWrr128")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULHWYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULHWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VRCPPSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VRCPSSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTPSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTSSr")>; + +def HWWriteResGroup90 : SchedWriteRes<[HWPort01]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup90], (instregex "MULPDrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "MULPSrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "MULSDrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "MULSSrr")>; +def: InstRW<[HWWriteResGroup90], (instregex 
"VFMADD132PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD132PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD132PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD132PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD132SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD132SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD213SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADD231SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB132PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB132PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB132PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB132PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB213PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB213PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB213PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB213PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB231PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB231PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB231PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMADDSUB231PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB132PDYr")>; +def: InstRW<[HWWriteResGroup90], 
(instregex "VFMSUB132PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB132PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB132PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB132SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB132SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB213SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUB231SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD132PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD132PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD132PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD132PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD213PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD213PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD213PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD213PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD231PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD231PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD231PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFMSUBADD231PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD132PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD132PDr")>; +def: 
InstRW<[HWWriteResGroup90], (instregex "VFNMADD132PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD132PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD132SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD132SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD213SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMADD231SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB132SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213PSYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB213SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231PDYr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231PDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231PSYr")>; +def: 
InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231PSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231SDr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VFNMSUB231SSr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULPDYrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULPDrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULPSYrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULPSrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULSDrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULSSrr")>; + +def HWWriteResGroup91 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDUBSWrm64")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDWDirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHRSWrm64")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHUWirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHWirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULLWirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULUDQirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PSADBWirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "MUL_F32m")>; +def: InstRW<[HWWriteResGroup91], (instregex "MUL_F64m")>; +def: InstRW<[HWWriteResGroup91], (instregex "PCMPGTQrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "PHMINPOSUWrm128")>; +def: InstRW<[HWWriteResGroup91], (instregex "PMADDUBSWrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "PMADDWDrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "PMULDQrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "PMULHRSWrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "PMULHUWrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "PMULHWrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "PMULLWrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "PMULUDQrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "PSADBWrm")>; +def: 
InstRW<[HWWriteResGroup91], (instregex "RCPPSm")>; +def: InstRW<[HWWriteResGroup91], (instregex "RCPSSm")>; +def: InstRW<[HWWriteResGroup91], (instregex "RSQRTPSm")>; +def: InstRW<[HWWriteResGroup91], (instregex "RSQRTSSm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPCMPGTQYrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPCMPGTQrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPHMINPOSUWrm128")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMADDUBSWYrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMADDUBSWrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMADDWDYrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMADDWDrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULDQYrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULDQrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULHRSWYrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULHRSWrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULHUWYrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULHUWrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULHWYrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULHWrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULLWYrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULLWrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULUDQYrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPMULUDQrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPSADBWYrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VPSADBWrm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VRCPPSm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VRCPSSm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VRSQRTPSm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VRSQRTSSm")>; + +def HWWriteResGroup92 : SchedWriteRes<[HWPort01,HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup92], (instregex "MULPDrm")>; +def: InstRW<[HWWriteResGroup92], (instregex 
"MULPSrm")>; +def: InstRW<[HWWriteResGroup92], (instregex "MULSDrm")>; +def: InstRW<[HWWriteResGroup92], (instregex "MULSSrm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD132SSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD213SSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADD231SSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB132PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB132PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB132PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB132PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB213PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB213PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB213PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB213PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB231PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB231PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex 
"VFMADDSUB231PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMADDSUB231PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB132SSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB213SSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUB231SSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD132PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD132PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD132PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD132PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD213PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD213PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD213PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD213PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD231PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD231PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFMSUBADD231PSYm")>; +def: InstRW<[HWWriteResGroup92], 
(instregex "VFMSUBADD231PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD132SSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD213SSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMADD231SSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB132SSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB213SSm")>; +def: InstRW<[HWWriteResGroup92], 
(instregex "VFNMSUB231PDYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB231PDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB231PSYm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB231PSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB231SDm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VFNMSUB231SSm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VMULPDYrm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VMULPDrm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VMULPSYrm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VMULPSrm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VMULSDrm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VMULSSrm")>; + +def HWWriteResGroup93 : SchedWriteRes<[HWPort1,HWPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[HWWriteResGroup93], (instregex "CVTSI2SS64rr")>; +def: InstRW<[HWWriteResGroup93], (instregex "HADDPDrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "HADDPSrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "HSUBPDrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "HSUBPSrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VCVTSI2SS64rr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDYrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSYrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDYrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSYrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSrr")>; + +def HWWriteResGroup94 : SchedWriteRes<[HWPort1,HWPort6,HWPort06]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup94], (instregex "STR(16|32|64)r")>; + +def HWWriteResGroup95 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { + let Latency = 5; + 
let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup95], (instregex "MULX32rr")>; + +def HWWriteResGroup96 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[HWWriteResGroup96], (instregex "HADDPDrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "HADDPSrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "HSUBPDrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "HSUBPSrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHADDPDYrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHADDPDrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHADDPSYrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHADDPSrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPDYrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPDrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPSYrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPSrm")>; + +def HWWriteResGroup97 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup97], (instregex "CVTTSS2SI64rm")>; + +def HWWriteResGroup98 : SchedWriteRes<[HWPort1,HWPort23,HWPort06,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup98], (instregex "MULX32rm")>; + +def HWWriteResGroup99 : SchedWriteRes<[HWPort6,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: InstRW<[HWWriteResGroup99], (instregex "PAUSE")>; + +def HWWriteResGroup100 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: InstRW<[HWWriteResGroup100], (instregex "XSETBV")>; + +def HWWriteResGroup101 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[HWWriteResGroup101], 
(instregex "CMPXCHG(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG8rr")>;
+
+// ROUND{PS,PD,SS,SD} register forms (SSE4.1 and AVX).
+// These were previously listed under HWWriteResGroup101, whose resources
+// ([HWPort06,HWPort0156], latency 5, 5 micro-ops) describe CMPXCHG, not ROUND.
+// The model this patch removes (WriteROUNDr) described the register forms as
+// two micro-ops on HWPort1 with latency 6, and the memory forms in
+// HWWriteResGroup103 are the same two HWPort1 micro-ops plus one load
+// micro-op on HWPort23. Keep the register forms consistent with both.
+def HWWriteResGroup101_1 : SchedWriteRes<[HWPort1]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[HWWriteResGroup101_1], (instregex "ROUNDPDr")>;
+def: InstRW<[HWWriteResGroup101_1], (instregex "ROUNDPSr")>;
+def: InstRW<[HWWriteResGroup101_1], (instregex "ROUNDSDr")>;
+def: InstRW<[HWWriteResGroup101_1], (instregex "ROUNDSSr")>;
+def: InstRW<[HWWriteResGroup101_1], (instregex "VROUNDPDr")>;
+def: InstRW<[HWWriteResGroup101_1], (instregex "VROUNDPSr")>;
+def: InstRW<[HWWriteResGroup101_1], (instregex "VROUNDSDr")>;
+def: InstRW<[HWWriteResGroup101_1], (instregex "VROUNDSSr")>;
+def: InstRW<[HWWriteResGroup101_1], (instregex "VROUNDYPDr")>;
+def: InstRW<[HWWriteResGroup101_1], (instregex "VROUNDYPSr")>;
+
+def HWWriteResGroup102 : SchedWriteRes<[HWPort1,HWPort5]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTDQ2PDYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2DQYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2PSYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTPS2PHYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTTPD2DQYrr")>;
+
+def HWWriteResGroup103 : SchedWriteRes<[HWPort1,HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI16m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI32m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "ROUNDPDm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "ROUNDPSm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "ROUNDSDm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "ROUNDSSm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI16m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI32m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI16m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI32m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "VROUNDPDm")>;
+def: 
InstRW<[HWWriteResGroup103], (instregex "VROUNDPSm")>; +def: InstRW<[HWWriteResGroup103], (instregex "VROUNDSDm")>; +def: InstRW<[HWWriteResGroup103], (instregex "VROUNDSSm")>; +def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPDm")>; +def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPSm")>; + +def HWWriteResGroup104 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>; +def: InstRW<[HWWriteResGroup104], (instregex "VCVTDQ2PDYrm")>; -// VDIVPS. -// y,y,y. -def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> { - let Latency = 19; // 18-21 cycles. - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; +def HWWriteResGroup105 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; } -def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>; +def: InstRW<[HWWriteResGroup105], (instregex "SHLD(16|32|64)rrCL")>; +def: InstRW<[HWWriteResGroup105], (instregex "SHRD(16|32|64)rrCL")>; -// y,y,m256. -def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 23; // 18-21 + 4 cycles. 
+def HWWriteResGroup106 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> { + let Latency = 6; let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup106], (instregex "VCVTPS2PHYmr")>; + +def HWWriteResGroup107 : SchedWriteRes<[HWPort1,HWPort6,HWPort06,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup107], (instregex "SLDT(16|32|64)r")>; + +def HWWriteResGroup108 : SchedWriteRes<[HWPort6,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,5]; +} +def: InstRW<[HWWriteResGroup108], (instregex "STD")>; + +def HWWriteResGroup109 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,1,2]; +} +def: InstRW<[HWWriteResGroup109], (instregex "SHLD(16|32|64)mrCL")>; +def: InstRW<[HWWriteResGroup109], (instregex "SHRD(16|32|64)mrCL")>; + +def HWWriteResGroup110 : SchedWriteRes<[HWPort5]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup110], (instregex "AESDECLASTrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "AESDECrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "AESENCLASTrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "AESENCrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "VAESDECLASTrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "VAESDECrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "VAESENCLASTrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "VAESENCrr")>; + +def HWWriteResGroup111 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "AESDECLASTrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "AESDECrm")>; +def: InstRW<[HWWriteResGroup111], 
(instregex "AESENCLASTrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "AESENCrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "VAESDECLASTrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "VAESDECrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "VAESENCLASTrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "VAESENCrm")>; -// VDIVPD. -// y,y,y. -def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> { - let Latency = 27; // 19-35 cycles. +def HWWriteResGroup112 : SchedWriteRes<[HWPort0,HWPort5]> { + let Latency = 7; let NumMicroOps = 3; - let ResourceCycles = [2, 1]; + let ResourceCycles = [1,2]; } -def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>; +def: InstRW<[HWWriteResGroup112], (instregex "MPSADBWrri")>; +def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWYrri")>; +def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWrri")>; -// y,y,m256. -def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 31; // 19-35 + 4 cycles. +def HWWriteResGroup113 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 7; let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; + let ResourceCycles = [1,2,1]; } -def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>; +def: InstRW<[HWWriteResGroup113], (instregex "MPSADBWrmi")>; +def: InstRW<[HWWriteResGroup113], (instregex "VMPSADBWYrmi")>; +def: InstRW<[HWWriteResGroup113], (instregex "VMPSADBWrmi")>; -// VRCPPS. -// y,y. 
-def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> { +def HWWriteResGroup114 : SchedWriteRes<[HWPort6,HWPort06,HWPort15,HWPort0156]> { let Latency = 7; + let NumMicroOps = 7; + let ResourceCycles = [2,2,1,2]; +} +def: InstRW<[HWWriteResGroup114], (instregex "LOOP")>; /* NOTE(review): if instregex patterns are prefix-matched, "LOOP" would also select LOOPE/LOOPNE - confirm. */ + +def HWWriteResGroup115 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { + let Latency = 8; let NumMicroOps = 3; - let ResourceCycles = [2, 1]; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>; +def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI16m")>; +def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI32m")>; -// y,m256. -def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 11; +def HWWriteResGroup116 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup116], (instregex "DPPDrri")>; +def: InstRW<[HWWriteResGroup116], (instregex "VDPPDrri")>; + +def HWWriteResGroup117 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> { + let Latency = 9; let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; + let ResourceCycles = [1,1,1,1]; } -def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>; +def: InstRW<[HWWriteResGroup117], (instregex "DPPDrmi")>; +def: InstRW<[HWWriteResGroup117], (instregex "VDPPDrmi")>; -// ROUND SS/SD PS/PD. -// v,v,i. -def WriteROUNDr : SchedWriteRes<[HWPort1]> { - let Latency = 6; +def HWWriteResGroup118 : SchedWriteRes<[HWPort0]> { /* Latency 10, 2 uops, both cycles on HWPort0; used for the three PMULLD rr patterns below. */ + let Latency = 10; let NumMicroOps = 2; let ResourceCycles = [2]; } -def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>; +def: InstRW<[HWWriteResGroup118], (instregex "PMULLDrr")>; +def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDYrr")>; +def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDrr")>; -// v,m,i.
-def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> { +def HWWriteResGroup119 : SchedWriteRes<[HWPort0,HWPort23]> { /* Latency 10 and 3 uops are carried over as context from the replaced WriteROUNDm entry; the first port changes from HWPort1 to HWPort0. */ let Latency = 10; let NumMicroOps = 3; - let ResourceCycles = [2, 1]; + let ResourceCycles = [2,1]; } -def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>; +def: InstRW<[HWWriteResGroup119], (instregex "PMULLDrm")>; +def: InstRW<[HWWriteResGroup119], (instregex "VPMULLDYrm")>; +def: InstRW<[HWWriteResGroup119], (instregex "VPMULLDrm")>; -// DPPS. -// x,x,i / v,v,v,i. -def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> { - let Latency = 14; - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; +def HWWriteResGroup120 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> { /* Latency 10, 10 uops; applied to RCL(16|32|64)mCL and RCL8mCL below. */ + let Latency = 10; + let NumMicroOps = 10; + let ResourceCycles = [1,1,1,4,1,2]; } -def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>; +def: InstRW<[HWWriteResGroup120], (instregex "RCL(16|32|64)mCL")>; +def: InstRW<[HWWriteResGroup120], (instregex "RCL8mCL")>; -// x,m,i / v,v,m,i. -def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> { - let Latency = 18; - let NumMicroOps = 6; - let ResourceCycles = [2, 1, 1, 1, 1]; +def HWWriteResGroup121 : SchedWriteRes<[HWPort0]> { + let Latency = 11; + let NumMicroOps = 1; + let ResourceCycles = [1]; } -def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>; +def: InstRW<[HWWriteResGroup121], (instregex "DIVPSrr")>; +def: InstRW<[HWWriteResGroup121], (instregex "DIVSSrr")>; -// DPPD. -// x,x,i.
-def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> { - let Latency = 9; +def HWWriteResGroup122 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup122], (instregex "DIVPSrm")>; +def: InstRW<[HWWriteResGroup122], (instregex "DIVSSrm")>; + +def HWWriteResGroup123 : SchedWriteRes<[HWPort0]> { /* Latency 11, 3 uops, all three cycles on HWPort0; covers the rr patterns of (V)PCMPISTRI and (V)PCMPISTRM128. */ + let Latency = 11; let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; + let ResourceCycles = [3]; } -def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>; +def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRIrr")>; +def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRM128rr")>; +def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRIrr")>; +def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRM128rr")>; -// x,m,i. -def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> { - let Latency = 13; +def HWWriteResGroup124 : SchedWriteRes<[HWPort0,HWPort5]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup124], (instregex "PCLMULQDQrr")>; +def: InstRW<[HWWriteResGroup124], (instregex "VPCLMULQDQrr")>; + +def HWWriteResGroup125 : SchedWriteRes<[HWPort0,HWPort015]> { /* Latency 11, 3 uops over HWPort0 and HWPort015; applied to VRCPPSYr and VRSQRTPSYr below. */ + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup125], (instregex "VRCPPSYr")>; +def: InstRW<[HWWriteResGroup125], (instregex "VRSQRTPSYr")>; + +def HWWriteResGroup126 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 11; let NumMicroOps = 4; - let ResourceCycles = [1, 1, 1, 1]; + let ResourceCycles = [3,1]; } -def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>; +def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRIrm")>; +def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRM128rm")>; +def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRIrm")>; +def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRM128rm")>; -// VFMADD. -// v,v,v.
-def WriteFMADDr : SchedWriteRes<[HWPort01]> { - let Latency = 5; +def HWWriteResGroup127 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[HWWriteResGroup127], (instregex "PCLMULQDQrm")>; +def: InstRW<[HWWriteResGroup127], (instregex "VPCLMULQDQrm")>; + +def HWWriteResGroup128 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[HWWriteResGroup128], (instregex "VRCPPSYm")>; +def: InstRW<[HWWriteResGroup128], (instregex "VRSQRTPSYm")>; + +def HWWriteResGroup129 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { /* Shared by RCL and RCR (16|32|64)rCL; RCL8rCL has its own entry below. */ + let Latency = 11; + let NumMicroOps = 7; + let ResourceCycles = [2,2,3]; +} +def: InstRW<[HWWriteResGroup129], (instregex "RCL(16|32|64)rCL")>; +def: InstRW<[HWWriteResGroup129], (instregex "RCR(16|32|64)rCL")>; + +def HWWriteResGroup130 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> { + let Latency = 11; + let NumMicroOps = 9; + let ResourceCycles = [1,4,1,3]; +} +def: InstRW<[HWWriteResGroup130], (instregex "RCL8rCL")>; + +def HWWriteResGroup131 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 11; + let NumMicroOps = 11; + let ResourceCycles = [2,9]; +} +def: InstRW<[HWWriteResGroup131], (instregex "LOOPE")>; +def: InstRW<[HWWriteResGroup131], (instregex "LOOPNE")>; + +def HWWriteResGroup132 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> { /* Latency 11, 14 uops; applied to CMPXCHG8B. */ + let Latency = 11; + let NumMicroOps = 14; + let ResourceCycles = [1,1,1,4,2,5]; +} +def: InstRW<[HWWriteResGroup132], (instregex "CMPXCHG8B")>; + +def HWWriteResGroup133 : SchedWriteRes<[HWPort0]> { + let Latency = 13; let NumMicroOps = 1; + let ResourceCycles = [1]; } -def : InstRW<[WriteFMADDr], - (instregex - // 3p forms. - "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?", - // 3s forms. - "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r", - // 4s/4s_int forms.
- "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?", - // 4p forms. - "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>; +def: InstRW<[HWWriteResGroup133], (instregex "SQRTPSr")>; +def: InstRW<[HWWriteResGroup133], (instregex "SQRTSSr")>; +def: InstRW<[HWWriteResGroup133], (instregex "VDIVPSrr")>; +def: InstRW<[HWWriteResGroup133], (instregex "VDIVSSrr")>; -// v,v,m. -def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> { - let Latency = 9; +def HWWriteResGroup134 : SchedWriteRes<[HWPort0,HWPort23]> { /* Latency 13, 2 uops (HWPort0 + HWPort23); covers SQRTPSm/SQRTSSm and VDIVPSrm/VDIVSSrm. */ + let Latency = 13; let NumMicroOps = 2; - let ResourceCycles = [1, 1]; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteFMADDm], - (instregex - // 3p forms. - "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?", - // 3s forms. - "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m", - // 4s/4s_int forms. - "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?", - // 4p forms. - "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>; +def: InstRW<[HWWriteResGroup134], (instregex "SQRTPSm")>; +def: InstRW<[HWWriteResGroup134], (instregex "SQRTSSm")>; +def: InstRW<[HWWriteResGroup134], (instregex "VDIVPSrm")>; +def: InstRW<[HWWriteResGroup134], (instregex "VDIVSSrm")>; -//-- Math instructions --// +def HWWriteResGroup135 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> { + let Latency = 13; + let NumMicroOps = 11; + let ResourceCycles = [2,1,1,3,1,3]; +} +def: InstRW<[HWWriteResGroup135], (instregex "RCR(16|32|64)mCL")>; +def: InstRW<[HWWriteResGroup135], (instregex "RCR8mCL")>; -// VSQRTPS. -// y,y.
-def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { - let Latency = 19; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; +def HWWriteResGroup136 : SchedWriteRes<[HWPort0]> { /* Latency 14, one uop on HWPort0; shared by DIVPDrr/DIVSDrr and VSQRTPSr/VSQRTSSr. */ + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1]; } -def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>; +def: InstRW<[HWWriteResGroup136], (instregex "DIVPDrr")>; +def: InstRW<[HWWriteResGroup136], (instregex "DIVSDrr")>; +def: InstRW<[HWWriteResGroup136], (instregex "VSQRTPSr")>; +def: InstRW<[HWWriteResGroup136], (instregex "VSQRTSSr")>; -// y,m256. -def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 23; - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; +def HWWriteResGroup137 : SchedWriteRes<[HWPort5]> { /* Latency 14, 2 uops, both cycles on HWPort5; applied to AESIMCrr and VAESIMCrr. */ + let Latency = 14; + let NumMicroOps = 2; + let ResourceCycles = [2]; } -def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>; +def: InstRW<[HWWriteResGroup137], (instregex "AESIMCrr")>; +def: InstRW<[HWWriteResGroup137], (instregex "VAESIMCrr")>; -// VSQRTPD. -// y,y. -def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> { - let Latency = 28; +def HWWriteResGroup138 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 14; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup138], (instregex "DIVPDrm")>; +def: InstRW<[HWWriteResGroup138], (instregex "DIVSDrm")>; +def: InstRW<[HWWriteResGroup138], (instregex "VSQRTPSm")>; +def: InstRW<[HWWriteResGroup138], (instregex "VSQRTSSm")>; + +def HWWriteResGroup139 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 14; let NumMicroOps = 3; - let ResourceCycles = [2, 1]; + let ResourceCycles = [2,1]; } -def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>; +def: InstRW<[HWWriteResGroup139], (instregex "AESIMCrm")>; +def: InstRW<[HWWriteResGroup139], (instregex "VAESIMCrm")>; -// y,m256.
-def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 32; +def HWWriteResGroup140 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> { /* Latency 14, 4 uops; the three DPPS rri opcode names are listed individually below. */ + let Latency = 14; let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; + let ResourceCycles = [2,1,1]; } -def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>; +def: InstRW<[HWWriteResGroup140], (instregex "DPPSrri")>; +def: InstRW<[HWWriteResGroup140], (instregex "VDPPSYrri")>; +def: InstRW<[HWWriteResGroup140], (instregex "VDPPSrri")>; -// RSQRT SS/PS. -// x,x. -def WriteRSQRTr : SchedWriteRes<[HWPort0]> { - let Latency = 5; +def HWWriteResGroup141 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> { /* rmi counterpart of HWWriteResGroup140: latency stays 14, with a fifth uop on HWPort23. */ + let Latency = 14; + let NumMicroOps = 5; + let ResourceCycles = [2,1,1,1]; } -def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>; +def: InstRW<[HWWriteResGroup141], (instregex "DPPSrmi")>; +def: InstRW<[HWWriteResGroup141], (instregex "VDPPSYrmi")>; +def: InstRW<[HWWriteResGroup141], (instregex "VDPPSrmi")>; -// x,m128.
-def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> { - let Latency = 9; +def HWWriteResGroup142 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [2,3,1,4]; +} +def: InstRW<[HWWriteResGroup142], (instregex "RCR8rCL")>; + +def HWWriteResGroup143 : SchedWriteRes<[HWPort23,HWPort0156]> { /* Latency 14, 15 uops; the pattern below names POPF16 only. */ + let Latency = 14; + let NumMicroOps = 15; + let ResourceCycles = [1,14]; +} +def: InstRW<[HWWriteResGroup143], (instregex "POPF16")>; + +def HWWriteResGroup144 : SchedWriteRes<[HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 15; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,1,2]; +} +def: InstRW<[HWWriteResGroup144], (instregex "INSB")>; +def: InstRW<[HWWriteResGroup144], (instregex "INSL")>; +def: InstRW<[HWWriteResGroup144], (instregex "INSW")>; + +def HWWriteResGroup145 : SchedWriteRes<[HWPort5]> { /* Latency 16, 16 uops, all charged to HWPort5; applied to VZEROALL. */ + let Latency = 16; + let NumMicroOps = 16; + let ResourceCycles = [16]; +} +def: InstRW<[HWWriteResGroup145], (instregex "VZEROALL")>; + +def HWWriteResGroup146 : SchedWriteRes<[HWPort0,HWPort4,HWPort5,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 16; + let NumMicroOps = 19; + let ResourceCycles = [2,1,4,1,1,4,6]; +} +def: InstRW<[HWWriteResGroup146], (instregex "CMPXCHG16B")>; + +def HWWriteResGroup147 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> { + let Latency = 17; + let NumMicroOps = 15; + let ResourceCycles = [2,1,2,4,2,4]; +} +def: InstRW<[HWWriteResGroup147], (instregex "XCH_F")>; + +def HWWriteResGroup148 : SchedWriteRes<[HWPort0,HWPort5,HWPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [4,3,1]; +} +def: InstRW<[HWWriteResGroup148], (instregex "PCMPESTRIrr")>; +def: InstRW<[HWWriteResGroup148], (instregex "VPCMPESTRIrr")>; + +def HWWriteResGroup149 : SchedWriteRes<[HWPort5,HWPort6,HWPort06,HWPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,5]; +}
+def: InstRW<[HWWriteResGroup149], (instregex "CPUID")>; +def: InstRW<[HWWriteResGroup149], (instregex "RDTSC")>; + +def HWWriteResGroup150 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort0156]> { + let Latency = 18; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def: InstRW<[HWWriteResGroup150], (instregex "PCMPESTRIrm")>; +def: InstRW<[HWWriteResGroup150], (instregex "VPCMPESTRIrm")>; + +def HWWriteResGroup151 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> { /* Latency 18, 19 uops; applied to XRSTOR with or without the 64 suffix. */ + let Latency = 18; + let NumMicroOps = 19; + let ResourceCycles = [3,1,15]; +} +def: InstRW<[HWWriteResGroup151], (instregex "XRSTOR(64)?")>; /* "(64)?" makes the whole suffix optional; the earlier "(64?)" required a literal 6 and so never matched plain XRSTOR. */ + +def HWWriteResGroup152 : SchedWriteRes<[HWPort0,HWPort5,HWPort015,HWPort0156]> { + let Latency = 19; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def: InstRW<[HWWriteResGroup152], (instregex "PCMPESTRM128rr")>; +def: InstRW<[HWWriteResGroup152], (instregex "VPCMPESTRM128rr")>; + +def HWWriteResGroup153 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015,HWPort0156]> { + let Latency = 19; + let NumMicroOps = 10; + let ResourceCycles = [4,3,1,1,1]; +} +def: InstRW<[HWWriteResGroup153], (instregex "PCMPESTRM128rm")>; +def: InstRW<[HWWriteResGroup153], (instregex "VPCMPESTRM128rm")>; + +def HWWriteResGroup154 : SchedWriteRes<[HWPort0]> { /* Latency 20, one uop on HWPort0; shared by the DIV_F*ST0* patterns, SQRTPDr/SQRTSDr and VDIVPDrr/VDIVSDrr. */ + let Latency = 20; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup154], (instregex "DIV_FPrST0")>; +def: InstRW<[HWWriteResGroup154], (instregex "DIV_FST0r")>; +def: InstRW<[HWWriteResGroup154], (instregex "DIV_FrST0")>; +def: InstRW<[HWWriteResGroup154], (instregex "SQRTPDr")>; +def: InstRW<[HWWriteResGroup154], (instregex "SQRTSDr")>; +def: InstRW<[HWWriteResGroup154], (instregex "VDIVPDrr")>; +def: InstRW<[HWWriteResGroup154], (instregex "VDIVSDrr")>; + +def HWWriteResGroup155 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 20; let NumMicroOps = 2; - let ResourceCycles = [1, 1]; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteRSQRTm], (instregex
"(V?)RSQRT(SS|PS)m(_Int)?")>; +def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F32m")>; +def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F64m")>; +def: InstRW<[HWWriteResGroup155], (instregex "SQRTPDm")>; +def: InstRW<[HWWriteResGroup155], (instregex "SQRTSDm")>; +def: InstRW<[HWWriteResGroup155], (instregex "VDIVPDrm")>; +def: InstRW<[HWWriteResGroup155], (instregex "VDIVSDrm")>; -// RSQRTPS 256. -// y,y. -def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { - let Latency = 7; +def HWWriteResGroup156 : SchedWriteRes<[HWPort5,HWPort6,HWPort0156]> { + let Latency = 20; + let NumMicroOps = 10; + let ResourceCycles = [1,2,7]; +} +def: InstRW<[HWWriteResGroup156], (instregex "MWAITrr")>; + +def HWWriteResGroup157 : SchedWriteRes<[HWPort0]> { + let Latency = 21; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup157], (instregex "VSQRTPDr")>; +def: InstRW<[HWWriteResGroup157], (instregex "VSQRTSDr")>; + +def HWWriteResGroup158 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 21; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup158], (instregex "VSQRTPDm")>; +def: InstRW<[HWWriteResGroup158], (instregex "VSQRTSDm")>; + +def HWWriteResGroup159 : SchedWriteRes<[HWPort0,HWPort015]> { + let Latency = 21; let NumMicroOps = 3; - let ResourceCycles = [2, 1]; + let ResourceCycles = [2,1]; } -def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>; +def: InstRW<[HWWriteResGroup159], (instregex "VDIVPSYrr")>; +def: InstRW<[HWWriteResGroup159], (instregex "VSQRTPSYr")>; -// y,m256.
-def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 11; +def HWWriteResGroup160 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> { + let Latency = 21; let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; + let ResourceCycles = [2,1,1]; } -def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>; +def: InstRW<[HWWriteResGroup160], (instregex "VDIVPSYrm")>; +def: InstRW<[HWWriteResGroup160], (instregex "VSQRTPSYm")>; -//-- Logic instructions --// +def HWWriteResGroup161 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { + let Latency = 23; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI16m")>; +def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI32m")>; -// AND, ANDN, OR, XOR PS/PD. -// x,x / v,v,v. -def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>; -// x,m / v,v,m. -def : InstRW<[WriteP5Ld, ReadAfterLd], - (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>; +def HWWriteResGroup162 : SchedWriteRes<[HWPort0]> { /* Latency 24, one uop on HWPort0; applied to the three DIVR_F*ST0* patterns below. */ + let Latency = 24; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FPrST0")>; +def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FST0r")>; +def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FrST0")>; -//-- Other instructions --// +def HWWriteResGroup163 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 24; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup163], (instregex "DIV_F32m")>; +def: InstRW<[HWWriteResGroup163], (instregex "DIV_F64m")>; -// VZEROUPPER. -def WriteVZEROUPPER : SchedWriteRes<[]> { - let NumMicroOps = 4; +def HWWriteResGroup164 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> { /* Latency 24, 27 uops; the pattern below names XSAVE64 only. */ + let Latency = 24; + let NumMicroOps = 27; + let ResourceCycles = [1,5,1,1,19]; } -def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>; +def: InstRW<[HWWriteResGroup164], (instregex "XSAVE64")>; -// VZEROALL.
-def WriteVZEROALL : SchedWriteRes<[]> { - let NumMicroOps = 12; +def HWWriteResGroup165 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> { + let Latency = 25; + let NumMicroOps = 28; + let ResourceCycles = [1,6,1,1,19]; } -def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>; +def: InstRW<[HWWriteResGroup165], (instregex "XSAVE(OPT?)")>; -// LDMXCSR. -def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> { - let Latency = 6; +def HWWriteResGroup166 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { + let Latency = 27; let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>; +def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI16m")>; +def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI32m")>; -// STMXCSR. -def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> { - let Latency = 7; +def HWWriteResGroup167 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015]> { + let Latency = 28; + let NumMicroOps = 11; + let ResourceCycles = [2,7,1,1]; +} +def: InstRW<[HWWriteResGroup167], (instregex "AESKEYGENASSIST128rm")>; +def: InstRW<[HWWriteResGroup167], (instregex "VAESKEYGENASSIST128rm")>; + +def HWWriteResGroup168 : SchedWriteRes<[HWPort0,HWPort5,HWPort015]> { + let Latency = 29; + let NumMicroOps = 11; + let ResourceCycles = [2,7,2]; +} +def: InstRW<[HWWriteResGroup168], (instregex "AESKEYGENASSIST128rr")>; +def: InstRW<[HWWriteResGroup168], (instregex "VAESKEYGENASSIST128rr")>; + +def HWWriteResGroup170 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort06,HWPort0156]> { + let Latency = 30; + let NumMicroOps = 23; + let ResourceCycles = [1,5,3,4,10]; +} +def: InstRW<[HWWriteResGroup170], (instregex "IN32ri")>; +def: InstRW<[HWWriteResGroup170], (instregex "IN32rr")>; +def: InstRW<[HWWriteResGroup170], (instregex "IN8ri")>; +def: InstRW<[HWWriteResGroup170], (instregex "IN8rr")>; + +def HWWriteResGroup171 : 
SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 30; + let NumMicroOps = 23; + let ResourceCycles = [1,5,2,1,4,10]; +} +def: InstRW<[HWWriteResGroup171], (instregex "OUT32ir")>; +def: InstRW<[HWWriteResGroup171], (instregex "OUT32rr")>; +def: InstRW<[HWWriteResGroup171], (instregex "OUT8ir")>; +def: InstRW<[HWWriteResGroup171], (instregex "OUT8rr")>; + +def HWWriteResGroup172 : SchedWriteRes<[HWPort01,HWPort15,HWPort015,HWPort0156]> { + let Latency = 31; + let NumMicroOps = 31; + let ResourceCycles = [8,1,21,1]; +} +def: InstRW<[HWWriteResGroup172], (instregex "MMX_EMMS")>; + +def HWWriteResGroup173 : SchedWriteRes<[HWPort0,HWPort015]> { + let Latency = 35; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup173], (instregex "VDIVPDYrr")>; +def: InstRW<[HWWriteResGroup173], (instregex "VSQRTPDYr")>; + +def HWWriteResGroup174 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> { + let Latency = 35; let NumMicroOps = 4; - let ResourceCycles = [1, 1, 1, 1]; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[HWWriteResGroup174], (instregex "VDIVPDYrm")>; +def: InstRW<[HWWriteResGroup174], (instregex "VSQRTPDYm")>; + +def HWWriteResGroup175 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort15,HWPort0156]> { + let Latency = 35; + let NumMicroOps = 18; + let ResourceCycles = [1,1,2,3,1,1,1,8]; +} +def: InstRW<[HWWriteResGroup175], (instregex "VMCLEARm")>; + +def HWWriteResGroup176 : SchedWriteRes<[HWPort5,HWPort0156]> { + let Latency = 42; + let NumMicroOps = 22; + let ResourceCycles = [2,20]; +} +def: InstRW<[HWWriteResGroup176], (instregex "RDTSCP")>; + +def HWWriteResGroup177 : SchedWriteRes<[HWPort0,HWPort01,HWPort23,HWPort05,HWPort06,HWPort015,HWPort0156]> { + let Latency = 56; + let NumMicroOps = 64; + let ResourceCycles = [2,2,8,1,10,2,39]; +} +def: InstRW<[HWWriteResGroup177], (instregex "FLDENVm")>; +def: InstRW<[HWWriteResGroup177], (instregex "FLDENVm")>; + +def 
HWWriteResGroup178 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> { + let Latency = 59; + let NumMicroOps = 88; + let ResourceCycles = [4,4,31,1,2,1,45]; +} +def: InstRW<[HWWriteResGroup178], (instregex "FXRSTOR64")>; + +def HWWriteResGroup179 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> { + let Latency = 59; + let NumMicroOps = 90; + let ResourceCycles = [4,2,33,1,2,1,47]; +} +def: InstRW<[HWWriteResGroup179], (instregex "FXRSTOR")>; + +def HWWriteResGroup180 : SchedWriteRes<[HWPort5,HWPort01,HWPort0156]> { + let Latency = 75; + let NumMicroOps = 15; + let ResourceCycles = [6,3,6]; +} +def: InstRW<[HWWriteResGroup180], (instregex "FNINIT")>; + +def HWWriteResGroup181 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> { + let Latency = 98; + let NumMicroOps = 32; + let ResourceCycles = [7,7,3,3,1,11]; +} +def: InstRW<[HWWriteResGroup181], (instregex "DIV(16|32|64)r")>; + +def HWWriteResGroup182 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort06,HWPort0156]> { + let Latency = 112; + let NumMicroOps = 66; + let ResourceCycles = [4,2,4,8,14,34]; +} +def: InstRW<[HWWriteResGroup182], (instregex "IDIV(16|32|64)r")>; + +def HWWriteResGroup183 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6,HWPort237,HWPort06,HWPort0156]> { + let Latency = 114; + let NumMicroOps = 100; + let ResourceCycles = [9,9,11,8,1,11,21,30]; } -def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>; +def: InstRW<[HWWriteResGroup183], (instregex "FSTENVm")>; +def: InstRW<[HWWriteResGroup183], (instregex "FSTENVm")>; } // SchedModel Index: test/CodeGen/X86/avx-schedule.ll =================================================================== --- test/CodeGen/X86/avx-schedule.ll +++ test/CodeGen/X86/avx-schedule.ll @@ -23,8 +23,8 @@ ; HASWELL-LABEL: test_addpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: 
[7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addpd: ; BTVER2: # BB#0: @@ -59,8 +59,8 @@ ; HASWELL-LABEL: test_addps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addps: ; BTVER2: # BB#0: @@ -95,8 +95,8 @@ ; HASWELL-LABEL: test_addsubpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addsubpd: ; BTVER2: # BB#0: @@ -132,8 +132,8 @@ ; HASWELL-LABEL: test_addsubps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addsubps: ; BTVER2: # BB#0: @@ -171,9 +171,9 @@ ; HASWELL-LABEL: test_andnotpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andnotpd: ; BTVER2: # BB#0: @@ -219,9 +219,9 @@ ; HASWELL-LABEL: test_andnotps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: vandnps (%rdi), 
%ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andnotps: ; BTVER2: # BB#0: @@ -267,9 +267,9 @@ ; HASWELL-LABEL: test_andpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andpd: ; BTVER2: # BB#0: @@ -313,9 +313,9 @@ ; HASWELL-LABEL: test_andps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andps: ; BTVER2: # BB#0: @@ -360,8 +360,8 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33] ; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendpd: ; BTVER2: # BB#0: @@ -399,8 +399,8 @@ ; HASWELL-LABEL: test_blendps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33] -; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendps: ; BTVER2: # BB#0: @@ -435,8 +435,8 @@ ; HASWELL-LABEL: test_blendvpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00] -; HASWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendvpd: ; BTVER2: # BB#0: @@ -472,8 +472,8 @@ ; HASWELL-LABEL: test_blendvps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00] -; HASWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [2:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendvps: ; BTVER2: # BB#0: @@ -506,8 +506,8 @@ ; ; HASWELL-LABEL: test_broadcastf128: ; HASWELL: # BB#0: -; HASWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [4:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_broadcastf128: ; BTVER2: # BB#0: @@ -536,8 +536,8 @@ ; ; HASWELL-LABEL: test_broadcastsd_ymm: ; HASWELL: # BB#0: -; HASWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_broadcastsd_ymm: ; BTVER2: # BB#0: @@ -567,8 +567,8 @@ ; ; HASWELL-LABEL: test_broadcastss: ; HASWELL: # BB#0: -; HASWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: 
test_broadcastss: ; BTVER2: # BB#0: @@ -598,8 +598,8 @@ ; ; HASWELL-LABEL: test_broadcastss_ymm: ; HASWELL: # BB#0: -; HASWELL-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_broadcastss_ymm: ; BTVER2: # BB#0: @@ -634,9 +634,9 @@ ; HASWELL-LABEL: test_cmppd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; HASWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] ; HASWELL-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cmppd: ; BTVER2: # BB#0: @@ -679,9 +679,9 @@ ; HASWELL-LABEL: test_cmpps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; HASWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; HASWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] ; HASWELL-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cmpps: ; BTVER2: # BB#0: @@ -724,9 +724,9 @@ ; HASWELL-LABEL: test_cvtdq2pd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [6:1.00] -; HASWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] +; HASWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [6:1.00] ; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtdq2pd: ; BTVER2: # BB#0: @@ -767,10 +767,10 @@ ; ; HASWELL-LABEL: test_cvtdq2ps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00] -; HASWELL-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] +; HASWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vcvtdq2ps (%rdi), 
%ymm1 # sched: [3:1.00] ; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtdq2ps: ; BTVER2: # BB#0: @@ -810,9 +810,9 @@ ; HASWELL-LABEL: test_cvtpd2dq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00] -; HASWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [10:1.00] +; HASWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00] ; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtpd2dq: ; BTVER2: # BB#0: @@ -851,10 +851,10 @@ ; ; HASWELL-LABEL: test_cvtpd2ps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [9:1.00] +; HASWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [6:1.00] +; HASWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00] ; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtpd2ps: ; BTVER2: # BB#0: @@ -894,9 +894,9 @@ ; HASWELL-LABEL: test_cvtps2dq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00] +; HASWELL-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [3:1.00] ; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtps2dq: ; BTVER2: # BB#0: @@ -933,9 +933,9 @@ ; ; HASWELL-LABEL: test_divpd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [27:2.00] -; HASWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [31:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [35:2.00] +; HASWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [35:2.00] +; 
HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divpd: ; BTVER2: # BB#0: @@ -969,9 +969,9 @@ ; ; HASWELL-LABEL: test_divps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:2.00] -; HASWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [23:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [21:2.00] +; HASWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [21:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divps: ; BTVER2: # BB#0: @@ -1006,8 +1006,8 @@ ; HASWELL-LABEL: test_dpps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [14:2.00] -; HASWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [18:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [14:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_dpps: ; BTVER2: # BB#0: @@ -1045,9 +1045,9 @@ ; HASWELL-LABEL: test_extractf128: ; HASWELL: # BB#0: ; HASWELL-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [4:1.00] -; HASWELL-NEXT: vzeroupper # sched: [1:?] 
-; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: vzeroupper # sched: [4:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_extractf128: ; BTVER2: # BB#0: @@ -1083,8 +1083,8 @@ ; HASWELL-LABEL: test_haddpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00] -; HASWELL-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_haddpd: ; BTVER2: # BB#0: @@ -1120,8 +1120,8 @@ ; HASWELL-LABEL: test_haddps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00] -; HASWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [9:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_haddps: ; BTVER2: # BB#0: @@ -1157,8 +1157,8 @@ ; HASWELL-LABEL: test_hsubpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00] -; HASWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_hsubpd: ; BTVER2: # BB#0: @@ -1194,8 +1194,8 @@ ; HASWELL-LABEL: test_hsubps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00] -; HASWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [9:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [5:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_hsubps: ; BTVER2: # BB#0: @@ -1233,9 +1233,9 @@ ; HASWELL-LABEL: test_insertf128: ; HASWELL: # BB#0: ; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00] -; HASWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, 
%ymm0 # sched: [3:1.00] +; HASWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_insertf128: ; BTVER2: # BB#0: @@ -1272,8 +1272,8 @@ ; ; HASWELL-LABEL: test_lddqu: ; HASWELL: # BB#0: -; HASWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lddqu: ; BTVER2: # BB#0: @@ -1306,10 +1306,10 @@ ; ; HASWELL-LABEL: test_maskmovpd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [4:2.00] -; HASWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [13:1.00] +; HASWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [2:2.00] +; HASWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [4:1.00] ; HASWELL-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maskmovpd: ; BTVER2: # BB#0: @@ -1348,10 +1348,10 @@ ; ; HASWELL-LABEL: test_maskmovpd_ymm: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [4:2.00] -; HASWELL-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [14:1.00] +; HASWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [2:2.00] +; HASWELL-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [4:1.00] ; HASWELL-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maskmovpd_ymm: ; BTVER2: # BB#0: @@ -1390,10 +1390,10 @@ ; ; HASWELL-LABEL: test_maskmovps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [4:2.00] -; HASWELL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [13:1.00] +; HASWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [2:2.00] +; HASWELL-NEXT: vmaskmovps %xmm1, %xmm0, 
(%rdi) # sched: [4:1.00] ; HASWELL-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maskmovps: ; BTVER2: # BB#0: @@ -1432,10 +1432,10 @@ ; ; HASWELL-LABEL: test_maskmovps_ymm: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [4:2.00] -; HASWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [14:1.00] +; HASWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [2:2.00] +; HASWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [4:1.00] ; HASWELL-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maskmovps_ymm: ; BTVER2: # BB#0: @@ -1473,8 +1473,8 @@ ; HASWELL-LABEL: test_maxpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxpd: ; BTVER2: # BB#0: @@ -1510,8 +1510,8 @@ ; HASWELL-LABEL: test_maxps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxps: ; BTVER2: # BB#0: @@ -1547,8 +1547,8 @@ ; HASWELL-LABEL: test_minpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_minpd: ; BTVER2: # BB#0: @@ -1584,8 +1584,8 @@ ; HASWELL-LABEL: test_minps: ; HASWELL: # BB#0: ; HASWELL-NEXT: 
vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_minps: ; BTVER2: # BB#0: @@ -1622,10 +1622,10 @@ ; ; HASWELL-LABEL: test_movapd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [1:0.50] ; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movapd: ; BTVER2: # BB#0: @@ -1663,10 +1663,10 @@ ; ; HASWELL-LABEL: test_movaps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [1:0.50] ; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movaps: ; BTVER2: # BB#0: @@ -1705,9 +1705,9 @@ ; HASWELL-LABEL: test_movddup: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00] -; HASWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50] +; HASWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [1:0.50] ; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movddup: ; BTVER2: # BB#0: @@ -1744,9 +1744,9 @@ ; ; HASWELL-LABEL: test_movmskpd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00] -; HASWELL-NEXT: vzeroupper # sched: [1:?] 
-; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [3:1.00] +; HASWELL-NEXT: vzeroupper # sched: [4:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movmskpd: ; BTVER2: # BB#0: @@ -1778,9 +1778,9 @@ ; ; HASWELL-LABEL: test_movmskps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00] -; HASWELL-NEXT: vzeroupper # sched: [1:?] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmovmskps %ymm0, %eax # sched: [3:1.00] +; HASWELL-NEXT: vzeroupper # sched: [4:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movmskps: ; BTVER2: # BB#0: @@ -1814,7 +1814,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movntpd: ; BTVER2: # BB#0: @@ -1849,7 +1849,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movntps: ; BTVER2: # BB#0: @@ -1885,9 +1885,9 @@ ; HASWELL-LABEL: test_movshdup: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00] -; HASWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50] +; HASWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [1:0.50] ; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movshdup: ; BTVER2: # BB#0: @@ -1927,9 +1927,9 @@ ; HASWELL-LABEL: test_movsldup: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00] -; HASWELL-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50] +; HASWELL-NEXT: vmovsldup {{.*#+}} ymm1 
= mem[0,0,2,2,4,4,6,6] sched: [1:0.50] ; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movsldup: ; BTVER2: # BB#0: @@ -1970,10 +1970,10 @@ ; ; HASWELL-LABEL: test_movupd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [1:0.50] ; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movupd: ; BTVER2: # BB#0: @@ -2013,10 +2013,10 @@ ; ; HASWELL-LABEL: test_movups: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [4:0.50] +; HASWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [1:0.50] ; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movups: ; BTVER2: # BB#0: @@ -2052,9 +2052,9 @@ ; ; HASWELL-LABEL: test_mulpd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; HASWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulpd: ; BTVER2: # BB#0: @@ -2088,9 +2088,9 @@ ; ; HASWELL-LABEL: test_mulps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; HASWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulps: ; BTVER2: # 
BB#0: @@ -2127,9 +2127,9 @@ ; HASWELL-LABEL: orpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: orpd: ; BTVER2: # BB#0: @@ -2173,9 +2173,9 @@ ; HASWELL-LABEL: test_orps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_orps: ; BTVER2: # BB#0: @@ -2219,9 +2219,9 @@ ; HASWELL-LABEL: test_permilpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00] -; HASWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00] +; HASWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [1:1.00] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_permilpd: ; BTVER2: # BB#0: @@ -2261,9 +2261,9 @@ ; HASWELL-LABEL: test_permilpd_ymm: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00] -; HASWELL-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00] +; HASWELL-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [1:1.00] ; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_permilpd_ymm: ; BTVER2: # BB#0: @@ -2303,9 +2303,9 @@ ; HASWELL-LABEL: test_permilps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00] -; HASWELL-NEXT: 
vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00] +; HASWELL-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_permilps: ; BTVER2: # BB#0: @@ -2345,9 +2345,9 @@ ; HASWELL-LABEL: test_permilps_ymm: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00] -; HASWELL-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00] +; HASWELL-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [1:1.00] ; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_permilps_ymm: ; BTVER2: # BB#0: @@ -2385,8 +2385,8 @@ ; HASWELL-LABEL: test_permilvarpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_permilvarpd: ; BTVER2: # BB#0: @@ -2422,8 +2422,8 @@ ; HASWELL-LABEL: test_permilvarpd_ymm: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_permilvarpd_ymm: ; BTVER2: # BB#0: @@ -2459,8 +2459,8 @@ ; HASWELL-LABEL: test_permilvarps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: 
[2:1.00] ; ; BTVER2-LABEL: test_permilvarps: ; BTVER2: # BB#0: @@ -2496,8 +2496,8 @@ ; HASWELL-LABEL: test_permilvarps_ymm: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_permilvarps_ymm: ; BTVER2: # BB#0: @@ -2535,9 +2535,9 @@ ; HASWELL-LABEL: test_rcpps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:2.00] -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] ; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rcpps: ; BTVER2: # BB#0: @@ -2577,10 +2577,10 @@ ; ; HASWELL-LABEL: test_roundpd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [6:2.00] -; HASWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:2.00] +; HASWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [5:1.25] +; HASWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [6:2.00] ; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundpd: ; BTVER2: # BB#0: @@ -2620,10 +2620,10 @@ ; ; HASWELL-LABEL: test_roundps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [6:2.00] -; HASWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:2.00] +; HASWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [5:1.25] +; HASWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [6:2.00] ; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundps: ; BTVER2: # BB#0: @@ -2664,9 +2664,9 @@ ; HASWELL-LABEL: test_rsqrtps: ; HASWELL: # 
BB#0: ; HASWELL-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:2.00] -; HASWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:2.00] +; HASWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [11:2.00] ; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rsqrtps: ; BTVER2: # BB#0: @@ -2707,9 +2707,9 @@ ; HASWELL-LABEL: test_shufpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00] -; HASWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00] +; HASWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [1:1.00] ; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_shufpd: ; BTVER2: # BB#0: @@ -2747,8 +2747,8 @@ ; HASWELL-LABEL: test_shufps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00] -; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_shufps: ; BTVER2: # BB#0: @@ -2784,10 +2784,10 @@ ; ; HASWELL-LABEL: test_sqrtpd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [32:2.00] -; HASWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [28:2.00] +; HASWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [35:2.00] +; HASWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [35:2.00] ; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtpd: ; BTVER2: # BB#0: @@ -2827,10 +2827,10 @@ ; ; HASWELL-LABEL: test_sqrtps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: 
[23:2.00] -; HASWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [19:2.00] +; HASWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: [21:2.00] +; HASWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:2.00] ; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtps: ; BTVER2: # BB#0: @@ -2869,8 +2869,8 @@ ; HASWELL-LABEL: test_subpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subpd: ; BTVER2: # BB#0: @@ -2905,8 +2905,8 @@ ; HASWELL-LABEL: test_subps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subps: ; BTVER2: # BB#0: @@ -2947,11 +2947,11 @@ ; HASWELL-LABEL: test_testpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] -; HASWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00] ; HASWELL-NEXT: setb %al # sched: [1:0.50] -; HASWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [1:1.00] ; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_testpd: ; BTVER2: # BB#0: @@ -3002,12 +3002,12 @@ ; HASWELL-LABEL: test_testpd_ymm: ; HASWELL: # BB#0: ; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] -; HASWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33] +; HASWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: setb %al # sched: [1:0.50] -; 
HASWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] -; HASWELL-NEXT: vzeroupper # sched: [1:?] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vzeroupper # sched: [4:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_testpd_ymm: ; BTVER2: # BB#0: @@ -3057,11 +3057,11 @@ ; HASWELL-LABEL: test_testps: ; HASWELL: # BB#0: ; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] -; HASWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00] ; HASWELL-NEXT: setb %al # sched: [1:0.50] -; HASWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [1:1.00] ; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_testps: ; BTVER2: # BB#0: @@ -3112,12 +3112,12 @@ ; HASWELL-LABEL: test_testps_ymm: ; HASWELL: # BB#0: ; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25] -; HASWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33] +; HASWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: setb %al # sched: [1:0.50] -; HASWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50] -; HASWELL-NEXT: vzeroupper # sched: [1:?] 
-; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vzeroupper # sched: [4:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_testps_ymm: ; BTVER2: # BB#0: @@ -3163,9 +3163,9 @@ ; HASWELL-LABEL: test_unpckhpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00] +; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [1:1.00] ; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpckhpd: ; BTVER2: # BB#0: @@ -3203,8 +3203,8 @@ ; HASWELL-LABEL: test_unpckhps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpckhps: ; BTVER2: # BB#0: @@ -3241,9 +3241,9 @@ ; HASWELL-LABEL: test_unpcklpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00] +; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [1:1.00] ; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpcklpd: ; BTVER2: # BB#0: @@ -3281,8 +3281,8 @@ ; HASWELL-LABEL: test_unpcklps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: 
[1:1.00] -; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpcklps: ; BTVER2: # BB#0: @@ -3319,9 +3319,9 @@ ; HASWELL-LABEL: test_xorpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_xorpd: ; BTVER2: # BB#0: @@ -3365,9 +3365,9 @@ ; HASWELL-LABEL: test_xorps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] -; HASWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_xorps: ; BTVER2: # BB#0: @@ -3406,8 +3406,8 @@ ; ; HASWELL-LABEL: test_zeroall: ; HASWELL: # BB#0: -; HASWELL-NEXT: vzeroall # sched: [1:?] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vzeroall # sched: [16:16.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_zeroall: ; BTVER2: # BB#0: @@ -3436,8 +3436,8 @@ ; ; HASWELL-LABEL: test_zeroupper: ; HASWELL: # BB#0: -; HASWELL-NEXT: vzeroupper # sched: [1:?] 
-; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vzeroupper # sched: [4:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_zeroupper: ; BTVER2: # BB#0: Index: test/CodeGen/X86/avx2-schedule.ll =================================================================== --- test/CodeGen/X86/avx2-schedule.ll +++ test/CodeGen/X86/avx2-schedule.ll @@ -15,9 +15,9 @@ ; HASWELL-LABEL: test_pabsb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: vpabsb (%rdi), %ymm1 # sched: [5:0.50] +; HASWELL-NEXT: vpabsb (%rdi), %ymm1 # sched: [1:0.50] ; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pabsb: ; ZNVER1: # BB#0: @@ -44,9 +44,9 @@ ; HASWELL-LABEL: test_pabsd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: vpabsd (%rdi), %ymm1 # sched: [5:0.50] +; HASWELL-NEXT: vpabsd (%rdi), %ymm1 # sched: [1:0.50] ; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pabsd: ; ZNVER1: # BB#0: @@ -73,9 +73,9 @@ ; HASWELL-LABEL: test_pabsw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: vpabsw (%rdi), %ymm1 # sched: [5:0.50] +; HASWELL-NEXT: vpabsw (%rdi), %ymm1 # sched: [1:0.50] ; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pabsw: ; ZNVER1: # BB#0: @@ -101,8 +101,8 @@ ; HASWELL-LABEL: test_paddb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_paddb: ; ZNVER1: # 
BB#0: @@ -125,8 +125,8 @@ ; HASWELL-LABEL: test_paddd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_paddd: ; ZNVER1: # BB#0: @@ -149,8 +149,8 @@ ; HASWELL-LABEL: test_paddq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_paddq: ; ZNVER1: # BB#0: @@ -173,8 +173,8 @@ ; HASWELL-LABEL: test_paddw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_paddw: ; ZNVER1: # BB#0: @@ -198,9 +198,9 @@ ; HASWELL-LABEL: test_pand: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; HASWELL-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pand: ; ZNVER1: # BB#0: @@ -226,9 +226,9 @@ ; HASWELL-LABEL: test_pandn: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; HASWELL-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [5:0.50] +; HASWELL-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [1:0.50] ; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; 
ZNVER1-LABEL: test_pandn: ; ZNVER1: # BB#0: @@ -256,7 +256,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00] ; HASWELL-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [10:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pmulld: ; ZNVER1: # BB#0: @@ -279,8 +279,8 @@ ; HASWELL-LABEL: test_pmullw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; HASWELL-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pmullw: ; ZNVER1: # BB#0: @@ -304,9 +304,9 @@ ; HASWELL-LABEL: test_por: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; HASWELL-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_por: ; ZNVER1: # BB#0: @@ -331,8 +331,8 @@ ; HASWELL-LABEL: test_psubb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_psubb: ; ZNVER1: # BB#0: @@ -355,8 +355,8 @@ ; HASWELL-LABEL: test_psubd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_psubd: ; ZNVER1: # BB#0: @@ -379,8 +379,8 @@ ; HASWELL-LABEL: test_psubq: ; HASWELL: # BB#0: ; 
HASWELL-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_psubq: ; ZNVER1: # BB#0: @@ -403,8 +403,8 @@ ; HASWELL-LABEL: test_psubw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_psubw: ; ZNVER1: # BB#0: @@ -428,9 +428,9 @@ ; HASWELL-LABEL: test_pxor: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; HASWELL-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [1:0.50] ; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pxor: ; ZNVER1: # BB#0: Index: test/CodeGen/X86/avx512-cmp.ll =================================================================== --- test/CodeGen/X86/avx512-cmp.ll +++ test/CodeGen/X86/avx512-cmp.ll @@ -126,11 +126,11 @@ define i32 @test8(i32 %a1, i32 %a2, i32 %a3) { ; ALL-LABEL: test8: ; ALL: ## BB#0: -; ALL-NEXT: notl %edi ; ALL-NEXT: xorl $-2147483648, %esi ## imm = 0x80000000 ; ALL-NEXT: testl %edx, %edx ; ALL-NEXT: movl $1, %eax ; ALL-NEXT: cmovel %eax, %edx +; ALL-NEXT: notl %edi ; ALL-NEXT: orl %edi, %esi ; ALL-NEXT: cmovnel %edx, %eax ; ALL-NEXT: retq Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -1545,19 +1545,19 @@ } define <2 x float> @uitofp_2i1_float(<2 x i32> %a) { -; NOVL-LABEL: uitofp_2i1_float: -; NOVL: # BB#0: -; NOVL-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 -; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpextrb $8, %xmm0, %eax -; NOVL-NEXT: andl $1, %eax -; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 -; NOVL-NEXT: vpextrb $0, %xmm0, %eax -; NOVL-NEXT: andl $1, %eax -; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 -; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; NOVL-NEXT: retq +; KNL-LABEL: uitofp_2i1_float: +; KNL: # BB#0: +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vpextrb $8, %xmm0, %eax +; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vpextrb $0, %xmm0, %ecx +; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 +; KNL-NEXT: andl $1, %ecx +; KNL-NEXT: vcvtsi2ssl %ecx, %xmm2, %xmm1 +; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; KNL-NEXT: retq ; ; VL-LABEL: uitofp_2i1_float: ; VL: # BB#0: @@ -1567,6 +1567,34 @@ ; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} ; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; VL-NEXT: retq +; +; AVX512DQ-LABEL: uitofp_2i1_float: +; AVX512DQ: # BB#0: +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512DQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vpextrb $8, %xmm0, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 +; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax +; AVX512DQ-NEXT: andl $1, %eax +; AVX512DQ-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 +; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: uitofp_2i1_float: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: 
vcvtsi2ssl %eax, %xmm2, %xmm1 +; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 +; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX512BW-NEXT: retq %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> ret <2 x float> %1 Index: test/CodeGen/X86/avx512-ext.ll =================================================================== --- test/CodeGen/X86/avx512-ext.ll +++ test/CodeGen/X86/avx512-ext.ll @@ -48,8 +48,8 @@ define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { ; KNL-LABEL: zext_16x8mem_to_16x16: ; KNL: # BB#0: -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -70,8 +70,8 @@ define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { ; KNL-LABEL: sext_16x8mem_to_16x16: ; KNL: # BB#0: -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; KNL-NEXT: vpmovsxbw (%rdi), %ymm1 
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -974,73 +974,73 @@ ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $32, %rsp -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: cmpl %esi, %edi -; KNL-NEXT: setb %al ; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0 -; KNL-NEXT: kshiftlw $14, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx -; KNL-NEXT: vmovd %edx, %xmm1 -; KNL-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $14, %k0, %k1 +; KNL-NEXT: kshiftrw $15, %k1, %k1 +; KNL-NEXT: vmovd %eax, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, 
%k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: 
vpinsrb $14, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; KNL-NEXT: xorl %eax, %eax +; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 +; KNL-NEXT: setb %al ; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 @@ -1159,8 +1159,8 @@ ; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; KNL-NEXT: vpextrb $4, %xmm0, %ecx ; KNL-NEXT: kmovw %ecx, %k1 -; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpextrb $0, %xmm0, %ecx +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: kmovw %ecx, %k1 ; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] @@ -1942,14 +1942,23 @@ } define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) { -; CHECK-LABEL: test_extractelement_variable_v16i8: -; CHECK: ## BB#0: -; CHECK-NEXT: ## kill: %EDI %EDI %RDI -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: andl $15, %edi -; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movb (%rdi,%rax), %al -; CHECK-NEXT: retq +; KNL-LABEL: test_extractelement_variable_v16i8: +; KNL: ## BB#0: +; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; KNL-NEXT: ## kill: %EDI %EDI %RDI +; KNL-NEXT: andl $15, %edi +; KNL-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; KNL-NEXT: movb (%rdi,%rax), %al +; KNL-NEXT: retq +; +; SKX-LABEL: test_extractelement_variable_v16i8: +; SKX: ## BB#0: +; SKX-NEXT: ## kill: %EDI %EDI %RDI +; SKX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; SKX-NEXT: andl $15, %edi +; SKX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; SKX-NEXT: movb (%rdi,%rax), %al +; SKX-NEXT: retq %t2 = extractelement <16 x i8> %t1, i32 %index ret i8 %t2 } @@ -1967,8 +1976,8 @@ ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $64, %rsp -; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: 
vmovaps %ymm0, (%rsp) +; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: andl $31, %edi ; KNL-NEXT: movq %rsp, %rax ; KNL-NEXT: movb (%rdi,%rax), %al @@ -2015,9 +2024,9 @@ ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-64, %rsp ; KNL-NEXT: subq $128, %rsp -; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovaps %ymm0, (%rsp) +; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: andl $63, %edi ; KNL-NEXT: movq %rsp, %rax ; KNL-NEXT: movb (%rdi,%rax), %al @@ -2106,12 +2115,12 @@ define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) { ; KNL-LABEL: test_extractelement_varible_v2i1: ; KNL: ## BB#0: -; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: andl $1, %edi ; KNL-NEXT: movl -24(%rsp,%rdi,8), %eax ; KNL-NEXT: andl $1, %eax @@ -2136,12 +2145,12 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) { ; KNL-LABEL: test_extractelement_varible_v4i1: ; KNL: ## BB#0: -; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; KNL-NEXT: ## kill: %EDI %EDI %RDI ; KNL-NEXT: andl $3, %edi ; KNL-NEXT: movl -24(%rsp,%rdi,4), %eax ; KNL-NEXT: andl $1, %eax Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2880,7 +2880,6 @@ define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x 
float> %a, i8 %mask) { ; CHECK-LABEL: test_mask_vextractf32x4: ; CHECK: ## BB#0: -; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1 ; CHECK-NEXT: kmovw %edi, %k0 ; CHECK-NEXT: kshiftlw $12, %k0, %k1 ; CHECK-NEXT: kshiftrw $15, %k1, %k1 @@ -2898,6 +2897,7 @@ ; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1 ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq @@ -2941,7 +2941,6 @@ define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) { ; CHECK-LABEL: test_maskz_vextracti32x4: ; CHECK: ## BB#0: -; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 ; CHECK-NEXT: kmovw %edi, %k0 ; CHECK-NEXT: kshiftlw $12, %k0, %k1 ; CHECK-NEXT: kshiftrw $15, %k1, %k1 @@ -2959,6 +2958,7 @@ ; CHECK-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; CHECK-NEXT: kmovw %k1, %eax ; CHECK-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 ; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1 ; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -1835,73 +1835,8 @@ ; KNL-NEXT: .cfi_def_cfa_register %rbp ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $32, %rsp -; KNL-NEXT: vmovups (%rdi), %zmm2 -; KNL-NEXT: vmovups 64(%rdi), %zmm3 -; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1 -; KNL-NEXT: kshiftlw $14, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: kshiftlw $15, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: vmovd %ecx, %xmm3 -; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $13, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw 
$12, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $11, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $10, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $9, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $8, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $7, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $6, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $5, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $4, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $3, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $2, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftlw $1, %k1, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 -; KNL-NEXT: kshiftrw $15, %k1, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2 +; KNL-NEXT: vmovups 64(%rdi), %zmm2 +; KNL-NEXT: vcmpltps %zmm1, %zmm2, %k2 ; KNL-NEXT: kshiftlw $14, %k2, %k0 ; KNL-NEXT: kshiftrw 
$15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1965,138 +1900,203 @@ ; KNL-NEXT: kshiftrw $15, %k2, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; KNL-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z} -; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z} +; KNL-NEXT: vmovups (%rdi), %zmm3 +; KNL-NEXT: vcmpltps %zmm0, %zmm3, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $15, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: vmovd %ecx, %xmm3 +; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $13, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $12, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $11, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $10, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $9, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $8, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $7, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $6, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $5, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $4, 
%k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $3, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $2, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftlw $1, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; KNL-NEXT: kshiftrw $15, %k1, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z} ; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k0 -; KNL-NEXT: kshiftlw $14, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kshiftlw $14, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftlw $15, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %ecx ; KNL-NEXT: vmovd %ecx, %xmm4 ; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $13, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $13, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $12, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $12, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $11, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $11, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $10, %k0, %k1 
-; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $9, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $8, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $7, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $7, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $6, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $6, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $5, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $5, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $4, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $4, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $3, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $3, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $2, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: 
kmovw %k1, %eax +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; KNL-NEXT: kshiftlw $1, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftlw $1, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 -; KNL-NEXT: vcmpltps %zmm3, %zmm0, %k0 +; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm3 +; KNL-NEXT: vmovups 4(%rdi), %zmm4 {%k1} {z} +; KNL-NEXT: vcmpltps %zmm4, %zmm0, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vmovd %ecx, %xmm3 -; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; KNL-NEXT: vmovd %ecx, %xmm4 +; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $6, %eax, 
%xmm4, %xmm4 ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; KNL-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 ; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2 ; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3 ; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 @@ -2941,36 +2941,6 @@ ; ; KNL-LABEL: store_64i1: ; KNL: ## BB#0: -; KNL-NEXT: pushq %rbp -; KNL-NEXT: Lcfi9: -; KNL-NEXT: 
.cfi_def_cfa_offset 16 -; KNL-NEXT: pushq %r15 -; KNL-NEXT: Lcfi10: -; KNL-NEXT: .cfi_def_cfa_offset 24 -; KNL-NEXT: pushq %r14 -; KNL-NEXT: Lcfi11: -; KNL-NEXT: .cfi_def_cfa_offset 32 -; KNL-NEXT: pushq %r13 -; KNL-NEXT: Lcfi12: -; KNL-NEXT: .cfi_def_cfa_offset 40 -; KNL-NEXT: pushq %r12 -; KNL-NEXT: Lcfi13: -; KNL-NEXT: .cfi_def_cfa_offset 48 -; KNL-NEXT: pushq %rbx -; KNL-NEXT: Lcfi14: -; KNL-NEXT: .cfi_def_cfa_offset 56 -; KNL-NEXT: Lcfi15: -; KNL-NEXT: .cfi_offset %rbx, -56 -; KNL-NEXT: Lcfi16: -; KNL-NEXT: .cfi_offset %r12, -48 -; KNL-NEXT: Lcfi17: -; KNL-NEXT: .cfi_offset %r13, -40 -; KNL-NEXT: Lcfi18: -; KNL-NEXT: .cfi_offset %r14, -32 -; KNL-NEXT: Lcfi19: -; KNL-NEXT: .cfi_offset %r15, -24 -; KNL-NEXT: Lcfi20: -; KNL-NEXT: .cfi_offset %rbp, -16 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -2982,281 +2952,275 @@ ; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: vmovd %ecx, %xmm3 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r11d +; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r14d +; KNL-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r15d +; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r12d +; KNL-NEXT: vpinsrb $4, 
%ecx, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r13d +; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ebx +; KNL-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ebp +; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $8, %ecx, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vmovd %r9d, %xmm3 -; KNL-NEXT: kmovw %k1, %r9d -; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2 -; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2 -; 
KNL-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2 -; KNL-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL-NEXT: vpslld $31, %zmm2, %zmm2 -; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 -; KNL-NEXT: kmovw %k0, 6(%rdi) -; KNL-NEXT: kshiftlw $14, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r8d -; KNL-NEXT: kshiftlw $15, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r10d -; KNL-NEXT: kshiftlw $13, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r9d -; KNL-NEXT: kshiftlw $12, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r11d -; KNL-NEXT: kshiftlw $11, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r14d -; KNL-NEXT: kshiftlw $10, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r15d -; KNL-NEXT: kshiftlw $9, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r12d -; KNL-NEXT: kshiftlw $8, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r13d -; KNL-NEXT: kshiftlw $7, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: kshiftlw $6, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %esi -; KNL-NEXT: kshiftlw $5, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ebp -; KNL-NEXT: kshiftlw $4, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ebx -; KNL-NEXT: kshiftlw $3, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: kshiftlw $2, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %edx -; KNL-NEXT: kshiftlw $1, %k2, %k0 +; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %r10d, %xmm2 -; KNL-NEXT: kmovw %k0, %r10d -; KNL-NEXT: vptestmd %zmm1, 
%zmm1, %k1 -; KNL-NEXT: kshiftrw $15, %k2, %k0 -; KNL-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1 -; KNL-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1 -; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1 +; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm2 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vpslld $31, %zmm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL-NEXT: kmovw %k0, 4(%rdi) ; KNL-NEXT: kshiftlw $14, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r8d +; KNL-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $15, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r10d +; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $13, %k1, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r9d +; KNL-NEXT: vmovd %eax, %xmm3 +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $12, %k1, %k0 +; KNL-NEXT: vpinsrb $1, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r11d +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $11, %k1, %k0 +; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r14d +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $10, %k1, %k0 +; KNL-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r15d +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $9, %k1, %k0 +; 
KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r12d +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $8, %k1, %k0 +; KNL-NEXT: vpinsrb $5, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %r13d +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $7, %k1, %k0 +; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $6, %k1, %k0 +; KNL-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %esi +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $5, %k1, %k0 +; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ebp +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $4, %k1, %k0 +; KNL-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ebx +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $3, %k1, %k0 +; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: kshiftlw $2, %k1, %k0 +; KNL-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %edx +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: kshiftlw $1, %k1, %k0 +; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vmovd %r10d, %xmm1 -; KNL-NEXT: kmovw %k0, %r10d -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0 -; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; KNL-NEXT: 
vpinsrb $9, %esi, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0 -; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT: kmovw %k1, 2(%rdi) +; KNL-NEXT: vpmovsxbd %xmm2, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm2 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm1 +; KNL-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, 6(%rdi) +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL-NEXT: kmovw %k1, 4(%rdi) ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r8d +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r9d +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r10d +; KNL-NEXT: vmovd %ecx, %xmm1 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r11d +; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r14d +; KNL-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r15d +; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r12d +; KNL-NEXT: vpinsrb $4, %ecx, 
%xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %r13d +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %edx +; KNL-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %esi +; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ebp +; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ebx +; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: vmovd %r9d, %xmm0 -; KNL-NEXT: kmovw %k1, %r9d -; KNL-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: 
vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm0 +; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; KNL-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpslld $31, %zmm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, 2(%rdi) +; KNL-NEXT: kshiftlw $14, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $15, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $13, %k1, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: vmovd %ecx, %xmm0 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $12, %k1, %k0 +; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $11, %k1, %k0 +; KNL-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $10, %k1, %k0 +; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $9, %k1, %k0 +; KNL-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $8, %k1, %k0 +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $7, %k1, %k0 +; KNL-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $6, %k1, %k0 +; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $5, %k1, %k0 +; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $4, %k1, %k0 +; KNL-NEXT: 
vpinsrb $9, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $3, %k1, %k0 +; KNL-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftlw $2, %k1, %k0 +; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kshiftlw $1, %k1, %k0 +; KNL-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kmovw %k0, %ecx +; KNL-NEXT: kshiftrw $15, %k1, %k0 +; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, (%rdi) -; KNL-NEXT: popq %rbx -; KNL-NEXT: popq %r12 -; KNL-NEXT: popq %r13 -; KNL-NEXT: popq %r14 -; KNL-NEXT: popq %r15 -; KNL-NEXT: popq %rbp ; KNL-NEXT: retq ; ; SKX-LABEL: store_64i1: Index: test/CodeGen/X86/avx512-vec-cmp.ll =================================================================== --- test/CodeGen/X86/avx512-vec-cmp.ll +++ test/CodeGen/X86/avx512-vec-cmp.ll @@ -577,72 +577,72 @@ ; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm1 +; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 +; KNL-NEXT: vpslld $31, %zmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kshiftlw $14, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftlw $15, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: vmovd %ecx, %xmm0 
-; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; KNL-NEXT: vmovd %ecx, %xmm1 +; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $13, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $12, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $11, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $10, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $9, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $8, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $7, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $6, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $5, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $4, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw 
$3, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; KNL-NEXT: kshiftlw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 ; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; KNL-NEXT: kshiftlw $1, %k0, %k1 ; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm0 +; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1685,8 +1685,6 @@ ; AVX512F-32-NEXT: .cfi_offset %esi, -12 ; AVX512F-32-NEXT: .Lcfi9: ; AVX512F-32-NEXT: .cfi_offset %ebx, -8 -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $5, %al @@ -1707,39 +1705,39 @@ ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; 
AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 ; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %edx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %ebx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = 
[255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $6, %al @@ -1748,8 +1746,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al @@ -1758,8 +1756,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = 
[255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %al @@ -1767,8 +1765,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: andb $2, %al @@ -1777,8 +1775,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: 
vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl @@ -1789,8 +1787,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al @@ -1798,8 +1796,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1809,8 +1807,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = 
[255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1820,8 +1818,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1831,8 +1829,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; 
AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1842,8 +1840,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -1852,8 +1850,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -1864,8 +1862,8 @@ ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %bl @@ -1877,8 +1875,8 @@ ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %dl @@ -1887,8 +1885,8 @@ ; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = 
zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -1898,8 +1896,8 @@ ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -1910,8 +1908,8 @@ ; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -1921,8 +1919,8 @@ ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, 
%ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: # kill: %AL %AL %EAX %EAX @@ -1932,464 +1930,464 @@ ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: 
vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: 
vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, 
%eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 +; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5 +; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm5, %ymm5 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: movl %ecx, %esi ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %esi, %eax ; AVX512F-32-NEXT: shrl $30, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %esi, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: kmovd %ecx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; 
AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: 
vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: 
vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; 
AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; 
AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000 ; AVX512F-32-NEXT: shrl $12, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $13, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, 
%zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000 ; AVX512F-32-NEXT: shrl $14, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 
= [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000 ; AVX512F-32-NEXT: shrl $15, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %ebx ; AVX512F-32-NEXT: shrl $16, %ebx ; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %dl ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: movb %bl, %al ; 
AVX512F-32-NEXT: andb $15, %al ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: shrb $2, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; 
AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, 
%ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: # kill: %BL %BL %EBX %EBX ; AVX512F-32-NEXT: shrb $7, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, 
%ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: 
movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-32-NEXT: vextracti64x4 $1, 
%zmm2, %ymm4 +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3 -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4 +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $30, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kshiftlq $1, %k0, 
%k0 @@ -2397,29 +2395,29 @@ ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: korq %k1, %k0, %k1 -; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1} -; AVX512F-32-NEXT: vpcmpgtb %zmm5, %zmm6, %k2 {%k1} -; AVX512F-32-NEXT: vpcmpleb %zmm6, %zmm5, %k3 {%k1} -; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1} -; AVX512F-32-NEXT: vpcmpleb %zmm5, %zmm6, %k5 {%k1} -; AVX512F-32-NEXT: vpcmpgtb %zmm6, %zmm5, %k1 {%k1} +; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: kmovq %k0, (%esp) ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k3, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: kxorq %k0, %k0, %k0 ; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovq %k4, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k2 {%k1} +; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k5, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp) @@ -2571,8 +2569,6 @@ ; AVX512F-32-NEXT: .cfi_offset %esi, -12 ; AVX512F-32-NEXT: .Lcfi15: ; 
AVX512F-32-NEXT: .cfi_offset %ebx, -8 -; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $5, %al @@ -2593,39 +2589,39 @@ ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: kmovd %ecx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 ; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %edx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: 
vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %ebx, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 ; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $6, %al @@ -2634,8 +2630,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b 
%k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al @@ -2644,8 +2640,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %al @@ -2653,8 +2649,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 
{{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: andb $2, %al @@ -2663,8 +2659,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl @@ -2675,8 +2671,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al @@ -2684,8 +2680,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = 
[255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2695,8 +2691,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2706,8 +2702,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; 
AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2717,8 +2713,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2728,8 +2724,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax @@ -2738,8 +2734,8 @@ ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -2750,8 +2746,8 @@ ; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %bl @@ -2763,8 +2759,8 @@ ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = 
zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %dl @@ -2773,8 +2769,8 @@ ; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -2784,8 +2780,8 @@ ; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -2796,8 +2792,8 @@ ; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, 
%ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl @@ -2807,8 +2803,8 @@ ; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: # kill: %AL %AL %EAX %EAX @@ -2818,464 +2814,464 @@ ; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 ; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: 
kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 +; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5 +; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm5, %ymm5 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = 
zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: movl %ecx, %esi ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %esi, %eax ; AVX512F-32-NEXT: shrl $30, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} 
zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %esi, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; AVX512F-32-NEXT: kmovd %ecx, 
%k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, 
%zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa 
{{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %cl, %al ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; 
AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000 ; AVX512F-32-NEXT: shrl $12, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $13, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; 
AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000 ; AVX512F-32-NEXT: shrl $14, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000 ; AVX512F-32-NEXT: shrl $15, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %ebx ; AVX512F-32-NEXT: shrl $16, %ebx ; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %dl ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; 
AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: andb $15, %al ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: shrb $2, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: shrb $3, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb 
%ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %bl, %al ; AVX512F-32-NEXT: shrb $6, %al ; 
AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: # kill: %BL %BL %EBX %EBX ; AVX512F-32-NEXT: shrb $7, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, 
%zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $24, %eax ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6] +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: movb %al, %dl ; AVX512F-32-NEXT: andb $15, %dl ; AVX512F-32-NEXT: movb %dl, %al ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 +; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; 
AVX512F-32-NEXT: shrb $3, %al -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $29, %eax ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $28, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-32-NEXT: kmovd %eax, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3 -; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3 -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 
$1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1 -; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2] -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] -; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 +; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4 +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 +; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 +; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2] +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $30, %eax -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0 -; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-32-NEXT: kmovd %eax, %k0 -; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3 -; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 -; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0 +; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4 +; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 +; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrl $31, %eax ; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0 @@ -3283,29 +3279,29 @@ ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1 ; AVX512F-32-NEXT: korq %k1, %k0, %k1 -; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1} -; AVX512F-32-NEXT: vpcmpltub %zmm6, %zmm5, %k2 {%k1} -; AVX512F-32-NEXT: vpcmpleub %zmm6, %zmm5, %k3 {%k1} -; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1} -; AVX512F-32-NEXT: vpcmpnltub %zmm6, %zmm5, %k5 {%k1} -; AVX512F-32-NEXT: vpcmpnleub %zmm6, %zmm5, %k1 {%k1} +; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ; AVX512F-32-NEXT: kmovq %k0, (%esp) ; AVX512F-32-NEXT: movl (%esp), %eax ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k3, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: kxorq %k0, %k0, %k0 ; AVX512F-32-NEXT: kmovq %k0, 
{{[0-9]+}}(%esp) ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: kmovq %k4, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} +; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k2 {%k1} +; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 {%k1} +; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: kmovq %k5, {{[0-9]+}}(%esp) +; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp) ; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx ; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp) Index: test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -2695,32 +2695,32 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: test_cmp_b_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0] -; CHECK-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 ## encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02] -; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] ; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04] -; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xc0,0x02] -; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8] +; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] ; CHECK-NEXT: 
vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] -; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] -; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x02] +; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] +; CHECK-NEXT: vmovd %eax, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; CHECK-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; CHECK-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd2,0x02] ; CHECK-NEXT: kxnord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x46,0xc0] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03] -; CHECK-NEXT: vmovd %ecx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9] -; CHECK-NEXT: vmovd %r8d, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0] -; CHECK-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] -; CHECK-NEXT: ## xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-NEXT: vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] -; CHECK-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca] -; CHECK-NEXT: ## xmm1 = xmm1[0],xmm2[0] -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x03] +; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 ## encoding: 
[0x62,0xf1,0x75,0x28,0x64,0xc0] +; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] +; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02] +; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] +; CHECK-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8] +; CHECK-NEXT: vpunpckldq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x62,0xc0] +; CHECK-NEXT: ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: vmovd %edx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca] +; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1] +; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc2,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1) %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 @@ -2750,23 +2750,23 @@ ; CHECK-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0] ; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8] ; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02] -; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0] -; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0] -; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0] +; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] ; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0] ; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: 
[0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1] -; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] -; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01] -; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02] +; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] +; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] +; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01] +; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02] ; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03] ; CHECK-NEXT: vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8] ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] -; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] -; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] +; CHECK-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02] +; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03] ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) @@ -2793,32 +2793,32 
@@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: test_ucmp_b_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0] -; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] -; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02] -; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] ; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04] -; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05] -; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8] +; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] ; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x06] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] -; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] -; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x02] +; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] +; CHECK-NEXT: vmovd %eax, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; CHECK-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; CHECK-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd2,0x02] ; CHECK-NEXT: kxnord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x46,0xc0] ; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] -; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 
## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03] -; CHECK-NEXT: vmovd %ecx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc9] -; CHECK-NEXT: vmovd %r8d, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xd0] -; CHECK-NEXT: vpunpckldq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9] -; CHECK-NEXT: ## xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-NEXT: vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2] -; CHECK-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca] -; CHECK-NEXT: ## xmm1 = xmm1[0],xmm2[0] -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x03] +; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01] +; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] +; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02] +; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] +; CHECK-NEXT: vmovd %eax, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc8] +; CHECK-NEXT: vpunpckldq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x62,0xc0] +; CHECK-NEXT: ## xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: vmovd %edx, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca] +; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xc1] +; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc4,0xe3,0x7d,0x38,0xc2,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1) %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 @@ -2848,23 +2848,23 @@ ; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01] ; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8] ; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02] -; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0] -; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0] -; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0] +; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] ; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04] -; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0] ; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05] -; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] ; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06] -; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0] -; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0] -; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01] -; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02] +; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8] +; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6] +; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01] +; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x22,0xc1,0x02] ; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03] ; CHECK-NEXT: vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8] ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01] -; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02] -; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03] +; CHECK-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02] +; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0] +; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0] +; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03] ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01] ; CHECK-NEXT: retq ## encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) Index: test/CodeGen/X86/avx512vl-vec-cmp.ll =================================================================== --- test/CodeGen/X86/avx512vl-vec-cmp.ll +++ test/CodeGen/X86/avx512vl-vec-cmp.ll @@ -314,8 +314,8 @@ ; ; NoVLX-LABEL: test256_11: ; NoVLX: # BB#0: -; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 ; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm3 +; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 ; NoVLX-NEXT: vpand %ymm2, %ymm3, %ymm2 ; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: retq @@ -824,8 +824,8 @@ ; ; NoVLX-LABEL: test128_11: ; NoVLX: # BB#0: -; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 ; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm3 +; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 ; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2 ; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: retq Index: test/CodeGen/X86/avx512vl-vec-masked-cmp.ll =================================================================== 
--- test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -100,9 +100,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -225,9 +225,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -353,9 +353,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -482,9 +482,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -530,23 +530,8 @@ ; NoVLX-NEXT: movq %rsp, %rbp ; NoVLX-NEXT: .Lcfi34: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; 
NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi35: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi36: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi37: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi38: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi39: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -557,64 +542,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, 
%eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: 
kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -626,12 +611,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -653,30 +633,15 @@ ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi40: +; NoVLX-NEXT: .Lcfi35: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi41: +; NoVLX-NEXT: .Lcfi36: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi42: +; NoVLX-NEXT: .Lcfi37: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi43: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi44: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi45: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi46: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi47: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -687,64 +652,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: 
vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; 
NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -756,12 +721,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -785,30 +745,15 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi48: +; NoVLX-NEXT: .Lcfi38: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi49: +; NoVLX-NEXT: .Lcfi39: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi50: +; NoVLX-NEXT: .Lcfi40: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi51: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi52: -; NoVLX-NEXT: .cfi_offset %r12, 
-48 -; NoVLX-NEXT: .Lcfi53: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi54: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi55: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -820,64 +765,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -889,12 +834,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; 
NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -919,30 +859,15 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi56: +; NoVLX-NEXT: .Lcfi41: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi57: +; NoVLX-NEXT: .Lcfi42: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi58: +; NoVLX-NEXT: .Lcfi43: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi59: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi60: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi61: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi62: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi63: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -954,64 +879,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; 
NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; 
NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -1023,12 +948,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -1055,12 +975,12 @@ ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi64: +; NoVLX-NEXT: .Lcfi44: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi65: +; NoVLX-NEXT: .Lcfi45: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi66: +; NoVLX-NEXT: .Lcfi46: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1104,12 +1024,12 @@ ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi67: +; NoVLX-NEXT: .Lcfi47: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi68: +; NoVLX-NEXT: .Lcfi48: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi69: +; NoVLX-NEXT: .Lcfi49: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1155,12 +1075,12 @@ ; 
NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi70: +; NoVLX-NEXT: .Lcfi50: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi71: +; NoVLX-NEXT: .Lcfi51: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi72: +; NoVLX-NEXT: .Lcfi52: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -1216,12 +1136,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi73: +; NoVLX-NEXT: .Lcfi53: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi74: +; NoVLX-NEXT: .Lcfi54: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi75: +; NoVLX-NEXT: .Lcfi55: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -1400,12 +1320,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi76: +; NoVLX-NEXT: .Lcfi56: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi77: +; NoVLX-NEXT: .Lcfi57: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi78: +; NoVLX-NEXT: .Lcfi58: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1417,35 +1337,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, 
%eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -1475,12 +1395,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi79: +; NoVLX-NEXT: .Lcfi59: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi80: +; NoVLX-NEXT: .Lcfi60: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi81: +; NoVLX-NEXT: .Lcfi61: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1492,35 +1412,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; 
NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -1552,12 +1472,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi82: +; NoVLX-NEXT: .Lcfi62: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi83: +; NoVLX-NEXT: .Lcfi63: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi84: +; 
NoVLX-NEXT: .Lcfi64: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1570,35 +1490,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd 
%xmm0, %zmm0 @@ -1631,12 +1551,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi85: +; NoVLX-NEXT: .Lcfi65: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi86: +; NoVLX-NEXT: .Lcfi66: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi87: +; NoVLX-NEXT: .Lcfi67: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -1649,35 +1569,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, 
%ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -1711,12 +1631,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi88: +; NoVLX-NEXT: .Lcfi68: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi89: +; NoVLX-NEXT: .Lcfi69: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi90: +; NoVLX-NEXT: .Lcfi70: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1791,12 +1711,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi91: +; NoVLX-NEXT: .Lcfi71: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi92: +; NoVLX-NEXT: .Lcfi72: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi93: +; NoVLX-NEXT: .Lcfi73: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1873,12 +1793,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi94: +; NoVLX-NEXT: .Lcfi74: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi95: +; NoVLX-NEXT: .Lcfi75: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi96: +; NoVLX-NEXT: .Lcfi76: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -1957,12 +1877,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi97: +; NoVLX-NEXT: .Lcfi77: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi98: +; NoVLX-NEXT: .Lcfi78: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi99: +; NoVLX-NEXT: .Lcfi79: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -2043,12 +1963,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi100: +; NoVLX-NEXT: .Lcfi80: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi101: +; NoVLX-NEXT: .Lcfi81: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi102: +; NoVLX-NEXT: .Lcfi82: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2057,15 +1977,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi103: +; NoVLX-NEXT: .Lcfi83: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi104: +; NoVLX-NEXT: .Lcfi84: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi105: +; NoVLX-NEXT: .Lcfi85: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi106: +; NoVLX-NEXT: .Lcfi86: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi107: +; NoVLX-NEXT: .Lcfi87: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -2131,9 +2051,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -2169,12 +2089,12 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi108: +; NoVLX-NEXT: .Lcfi88: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi109: +; NoVLX-NEXT: .Lcfi89: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi110: +; NoVLX-NEXT: .Lcfi90: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2183,15 +2103,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi111: +; NoVLX-NEXT: .Lcfi91: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi112: +; NoVLX-NEXT: .Lcfi92: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi113: +; NoVLX-NEXT: .Lcfi93: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi114: +; NoVLX-NEXT: .Lcfi94: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi115: +; NoVLX-NEXT: .Lcfi95: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -2257,9 +2177,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -2297,12 +2217,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi116: +; NoVLX-NEXT: .Lcfi96: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi117: +; NoVLX-NEXT: .Lcfi97: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi118: +; NoVLX-NEXT: .Lcfi98: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2311,15 +2231,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi119: +; NoVLX-NEXT: .Lcfi99: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; 
NoVLX-NEXT: .Lcfi120: +; NoVLX-NEXT: .Lcfi100: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi121: +; NoVLX-NEXT: .Lcfi101: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi122: +; NoVLX-NEXT: .Lcfi102: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi123: +; NoVLX-NEXT: .Lcfi103: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -2386,9 +2306,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -2427,12 +2347,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi124: +; NoVLX-NEXT: .Lcfi104: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi125: +; NoVLX-NEXT: .Lcfi105: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi126: +; NoVLX-NEXT: .Lcfi106: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -2441,15 +2361,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi127: +; NoVLX-NEXT: .Lcfi107: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi128: +; NoVLX-NEXT: .Lcfi108: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi129: +; NoVLX-NEXT: .Lcfi109: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi130: +; NoVLX-NEXT: .Lcfi110: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi131: +; NoVLX-NEXT: .Lcfi111: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -2516,9 +2436,9 @@ ; NoVLX-NEXT: 
vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -2558,30 +2478,15 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi132: +; NoVLX-NEXT: .Lcfi112: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi133: +; NoVLX-NEXT: .Lcfi113: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi134: +; NoVLX-NEXT: .Lcfi114: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi135: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi136: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi137: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi138: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi139: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -2592,64 +2497,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: 
vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; 
NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -2661,12 +2566,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -2689,30 +2589,15 @@ ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi140: +; NoVLX-NEXT: .Lcfi115: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi141: +; NoVLX-NEXT: .Lcfi116: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi142: +; NoVLX-NEXT: .Lcfi117: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi143: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi144: -; NoVLX-NEXT: 
.cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi145: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi146: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi147: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -2723,64 +2608,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, 
%xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -2792,12 +2677,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq 
%r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -2822,30 +2702,15 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi148: +; NoVLX-NEXT: .Lcfi118: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi149: +; NoVLX-NEXT: .Lcfi119: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi150: +; NoVLX-NEXT: .Lcfi120: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi151: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi152: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi153: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi154: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi155: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -2857,64 +2722,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, 
%xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -2926,12 +2791,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -2957,30 +2817,15 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi156: +; NoVLX-NEXT: .Lcfi121: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi157: +; NoVLX-NEXT: .Lcfi122: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi158: +; NoVLX-NEXT: .Lcfi123: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi159: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi160: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi161: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi162: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi163: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, 
%ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -2992,64 +2837,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: 
kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -3061,12 +2906,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -3093,323 +2933,323 @@ ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: ; NoVLX: # 
BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi164: +; NoVLX-NEXT: .Lcfi124: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi165: +; NoVLX-NEXT: .Lcfi125: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi166: +; NoVLX-NEXT: .Lcfi126: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 
-; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rdx, %rcx +; NoVLX-NEXT: shrq $48, %rdx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: movl %eax, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: 
vpinsrw $6, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rdx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movl %edx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vmovd %edx, %xmm5 ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: movq %rdx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $3, %edx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $6, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: vpextrq 
$1, %xmm6, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movl %edx, %ecx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm6, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm6, %rax -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm1, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm8 +; NoVLX-NEXT: vmovd %eax, %xmm6 +; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: 
shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm4, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm7 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx ; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rdx ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 ; NoVLX-NEXT: 
vpinsrw $5, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rcx +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %eax, %xmm6, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm3, %ymm0 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm8, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $5, 
%k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw 
$13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -3444,69 +3284,68 @@ ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi167: +; NoVLX-NEXT: .Lcfi127: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi168: +; NoVLX-NEXT: .Lcfi128: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi169: +; NoVLX-NEXT: .Lcfi129: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: 
vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: 
vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 @@ -3514,7 +3353,8 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax @@ -3524,92 +3364,92 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpinsrw $7, 
%eax, %xmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vmovd %eax, %xmm0 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $6, 
%k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -3712,12 +3552,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: 
.Lcfi170: +; NoVLX-NEXT: .Lcfi130: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi171: +; NoVLX-NEXT: .Lcfi131: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi172: +; NoVLX-NEXT: .Lcfi132: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -3728,17 +3568,12 @@ ; NoVLX-NEXT: movq %rax, %rdx ; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 -; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -3746,9 +3581,10 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm4 @@ -3766,39 +3602,40 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; 
NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 @@ -3806,99 +3643,102 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm5 +; NoVLX-NEXT: vmovq %xmm5, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm6 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6 +; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm7 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm6 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, 
%xmm6 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm7 +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm7 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm6, 
%ymm0, %ymm4 +; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm3 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm3 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm6 +; NoVLX-NEXT: vpmovdb %zmm5, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm4, %ymm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 @@ -4075,12 +3915,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi173: +; NoVLX-NEXT: .Lcfi133: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi174: +; NoVLX-NEXT: .Lcfi134: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi175: +; NoVLX-NEXT: .Lcfi135: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; 
NoVLX-NEXT: subq $96, %rsp @@ -4092,8 +3932,6 @@ ; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax @@ -4106,19 +3944,20 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -4126,6 +3965,7 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 @@ -4148,174 +3988,174 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: 
vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm3 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm2, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: 
kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovdb %zmm4, %xmm4 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm3, %ymm3 -; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 -; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 -; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: 
vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; 
NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm4, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -4356,8 +4196,8 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4411,8 +4251,8 @@ ; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4468,7 +4308,6 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -4481,13 +4320,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, 
%xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4545,7 +4385,6 @@ ; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -4558,13 +4397,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4624,8 +4464,8 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4683,7 +4523,6 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, 
%k0, %k3 @@ -4696,13 +4535,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -4762,8 +4602,8 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -4816,8 +4656,8 @@ ; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -4872,7 +4712,6 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -4885,13 +4724,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, 
%eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -4948,7 +4788,6 @@ ; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -4961,13 +4800,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -5026,8 +4866,8 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -5084,7 +4924,6 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -5097,13 
+4936,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -5159,12 +4999,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi176: +; NoVLX-NEXT: .Lcfi136: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi177: +; NoVLX-NEXT: .Lcfi137: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi178: +; NoVLX-NEXT: .Lcfi138: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5202,12 +5042,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi179: +; NoVLX-NEXT: .Lcfi139: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi180: +; NoVLX-NEXT: .Lcfi140: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi181: +; NoVLX-NEXT: .Lcfi141: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5247,19 +5087,18 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi182: +; NoVLX-NEXT: .Lcfi142: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi183: +; NoVLX-NEXT: .Lcfi143: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi184: +; NoVLX-NEXT: .Lcfi144: ; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -5272,7 +5111,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -5312,19 +5152,18 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi185: +; NoVLX-NEXT: .Lcfi145: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi186: +; NoVLX-NEXT: .Lcfi146: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi187: +; NoVLX-NEXT: .Lcfi147: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -5337,7 +5176,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -5378,12 +5218,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi188: +; NoVLX-NEXT: .Lcfi148: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi189: +; NoVLX-NEXT: .Lcfi149: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi190: +; NoVLX-NEXT: .Lcfi150: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5425,12 +5265,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi191: +; NoVLX-NEXT: .Lcfi151: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi192: +; NoVLX-NEXT: .Lcfi152: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi193: +; NoVLX-NEXT: .Lcfi153: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -5438,7 +5278,6 @@ ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -5451,7 +5290,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -5493,12 +5333,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: ; 
NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi194: +; NoVLX-NEXT: .Lcfi154: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi195: +; NoVLX-NEXT: .Lcfi155: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi196: +; NoVLX-NEXT: .Lcfi156: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -5542,12 +5382,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi197: +; NoVLX-NEXT: .Lcfi157: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi198: +; NoVLX-NEXT: .Lcfi158: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi199: +; NoVLX-NEXT: .Lcfi159: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -5593,16 +5433,15 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi200: +; NoVLX-NEXT: .Lcfi160: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi201: +; NoVLX-NEXT: .Lcfi161: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi202: +; NoVLX-NEXT: .Lcfi162: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -5614,13 +5453,14 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; 
NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -5664,12 +5504,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi203: +; NoVLX-NEXT: .Lcfi163: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi204: +; NoVLX-NEXT: .Lcfi164: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi205: +; NoVLX-NEXT: .Lcfi165: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -5736,12 +5576,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi206: +; NoVLX-NEXT: .Lcfi166: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi207: +; NoVLX-NEXT: .Lcfi167: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi208: +; NoVLX-NEXT: .Lcfi168: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -5789,17 +5629,16 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi209: +; NoVLX-NEXT: .Lcfi169: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi210: +; NoVLX-NEXT: .Lcfi170: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi211: +; NoVLX-NEXT: .Lcfi171: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -5811,13 +5650,14 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; 
NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -6052,12 +5892,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi212: +; NoVLX-NEXT: .Lcfi172: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi213: +; NoVLX-NEXT: .Lcfi173: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi214: +; NoVLX-NEXT: .Lcfi174: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6068,35 +5908,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, 
%k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -6127,12 +5967,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi215: +; NoVLX-NEXT: .Lcfi175: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi216: +; NoVLX-NEXT: .Lcfi176: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi217: +; NoVLX-NEXT: .Lcfi177: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6143,35 +5983,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw 
$13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -6204,12 +6044,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi218: +; NoVLX-NEXT: .Lcfi178: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi219: +; NoVLX-NEXT: .Lcfi179: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi220: +; NoVLX-NEXT: .Lcfi180: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6222,35 +6062,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; 
NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -6284,12 +6124,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi221: +; NoVLX-NEXT: 
.Lcfi181: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi222: +; NoVLX-NEXT: .Lcfi182: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi223: +; NoVLX-NEXT: .Lcfi183: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6302,35 +6142,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -6365,12 +6205,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi224: +; NoVLX-NEXT: .Lcfi184: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi225: +; NoVLX-NEXT: .Lcfi185: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi226: +; NoVLX-NEXT: .Lcfi186: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6381,35 +6221,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, 
%xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -6443,12 +6283,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi227: +; NoVLX-NEXT: .Lcfi187: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi228: +; NoVLX-NEXT: .Lcfi188: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi229: +; NoVLX-NEXT: .Lcfi189: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -6461,35 +6301,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -6525,12 +6365,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi230: +; NoVLX-NEXT: .Lcfi190: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi231: +; NoVLX-NEXT: .Lcfi191: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi232: +; NoVLX-NEXT: .Lcfi192: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -6605,12 +6445,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi233: +; NoVLX-NEXT: .Lcfi193: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi234: +; NoVLX-NEXT: .Lcfi194: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi235: +; NoVLX-NEXT: .Lcfi195: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -6687,12 +6527,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask: ; 
NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi236: +; NoVLX-NEXT: .Lcfi196: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi237: +; NoVLX-NEXT: .Lcfi197: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi238: +; NoVLX-NEXT: .Lcfi198: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -6772,12 +6612,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi239: +; NoVLX-NEXT: .Lcfi199: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi240: +; NoVLX-NEXT: .Lcfi200: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi241: +; NoVLX-NEXT: .Lcfi201: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -6858,12 +6698,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi242: +; NoVLX-NEXT: .Lcfi202: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi243: +; NoVLX-NEXT: .Lcfi203: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi244: +; NoVLX-NEXT: .Lcfi204: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -6941,12 +6781,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi245: +; NoVLX-NEXT: .Lcfi205: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi246: +; NoVLX-NEXT: .Lcfi206: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi247: +; NoVLX-NEXT: .Lcfi207: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -7028,12 +6868,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: 
.Lcfi248: +; NoVLX-NEXT: .Lcfi208: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi249: +; NoVLX-NEXT: .Lcfi209: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi250: +; NoVLX-NEXT: .Lcfi210: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7042,15 +6882,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi251: +; NoVLX-NEXT: .Lcfi211: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi252: +; NoVLX-NEXT: .Lcfi212: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi253: +; NoVLX-NEXT: .Lcfi213: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi254: +; NoVLX-NEXT: .Lcfi214: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi255: +; NoVLX-NEXT: .Lcfi215: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -7113,9 +6953,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7151,12 +6991,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi256: +; NoVLX-NEXT: .Lcfi216: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi257: +; NoVLX-NEXT: .Lcfi217: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi258: +; NoVLX-NEXT: .Lcfi218: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7165,15 +7005,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi259: +; 
NoVLX-NEXT: .Lcfi219: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi260: +; NoVLX-NEXT: .Lcfi220: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi261: +; NoVLX-NEXT: .Lcfi221: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi262: +; NoVLX-NEXT: .Lcfi222: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi263: +; NoVLX-NEXT: .Lcfi223: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -7236,9 +7076,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7276,12 +7116,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi264: +; NoVLX-NEXT: .Lcfi224: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi265: +; NoVLX-NEXT: .Lcfi225: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi266: +; NoVLX-NEXT: .Lcfi226: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7290,15 +7130,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi267: +; NoVLX-NEXT: .Lcfi227: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi268: +; NoVLX-NEXT: .Lcfi228: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi269: +; NoVLX-NEXT: .Lcfi229: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi270: +; NoVLX-NEXT: .Lcfi230: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi271: +; NoVLX-NEXT: .Lcfi231: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, 
%k0 {%k1} @@ -7362,9 +7202,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7403,12 +7243,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi272: +; NoVLX-NEXT: .Lcfi232: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi273: +; NoVLX-NEXT: .Lcfi233: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi274: +; NoVLX-NEXT: .Lcfi234: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7417,15 +7257,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi275: +; NoVLX-NEXT: .Lcfi235: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi276: +; NoVLX-NEXT: .Lcfi236: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi277: +; NoVLX-NEXT: .Lcfi237: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi278: +; NoVLX-NEXT: .Lcfi238: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi279: +; NoVLX-NEXT: .Lcfi239: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} @@ -7489,9 +7329,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7531,12 
+7371,12 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi280: +; NoVLX-NEXT: .Lcfi240: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi281: +; NoVLX-NEXT: .Lcfi241: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi282: +; NoVLX-NEXT: .Lcfi242: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -7545,15 +7385,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi283: +; NoVLX-NEXT: .Lcfi243: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi284: +; NoVLX-NEXT: .Lcfi244: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi285: +; NoVLX-NEXT: .Lcfi245: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi286: +; NoVLX-NEXT: .Lcfi246: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi287: +; NoVLX-NEXT: .Lcfi247: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -7616,9 +7456,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7657,12 +7497,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi288: +; NoVLX-NEXT: .Lcfi248: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi289: +; NoVLX-NEXT: .Lcfi249: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi290: +; NoVLX-NEXT: .Lcfi250: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq 
%r14 @@ -7671,15 +7511,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi291: +; NoVLX-NEXT: .Lcfi251: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi292: +; NoVLX-NEXT: .Lcfi252: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi293: +; NoVLX-NEXT: .Lcfi253: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi294: +; NoVLX-NEXT: .Lcfi254: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi295: +; NoVLX-NEXT: .Lcfi255: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} @@ -7743,9 +7583,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -7786,30 +7626,15 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi296: +; NoVLX-NEXT: .Lcfi256: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi297: +; NoVLX-NEXT: .Lcfi257: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi298: +; NoVLX-NEXT: .Lcfi258: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi299: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi300: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi301: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi302: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi303: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: 
vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -7817,64 +7642,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -7886,12 +7711,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -7914,30 +7734,15 @@ ; NoVLX-LABEL: 
test_vpcmpeqd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi304: +; NoVLX-NEXT: .Lcfi259: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi305: +; NoVLX-NEXT: .Lcfi260: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi306: +; NoVLX-NEXT: .Lcfi261: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi307: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi308: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi309: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi310: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi311: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -7945,64 +7750,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; 
NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; 
NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -8014,12 +7819,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -8044,30 +7844,15 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi312: +; NoVLX-NEXT: .Lcfi262: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi313: +; NoVLX-NEXT: .Lcfi263: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi314: +; NoVLX-NEXT: .Lcfi264: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi315: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi316: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi317: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi318: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi319: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -8076,64 +7861,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; 
NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -8145,12 +7930,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -8176,30 +7956,15 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi320: +; NoVLX-NEXT: .Lcfi265: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi321: +; NoVLX-NEXT: .Lcfi266: ; NoVLX-NEXT: 
.cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi322: +; NoVLX-NEXT: .Lcfi267: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi323: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi324: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi325: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi326: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi327: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -8208,64 +7973,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; 
NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: 
kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -8277,12 +8042,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -8309,30 +8069,15 @@ ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi328: +; NoVLX-NEXT: .Lcfi268: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi329: +; NoVLX-NEXT: .Lcfi269: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi330: +; NoVLX-NEXT: .Lcfi270: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi331: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi332: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi333: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi334: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi335: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -8340,64 +8085,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: 
kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: 
vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -8409,12 +8154,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -8440,30 +8180,15 @@ ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi336: +; NoVLX-NEXT: .Lcfi271: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi337: +; NoVLX-NEXT: .Lcfi272: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi338: +; NoVLX-NEXT: .Lcfi273: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; 
NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi339: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi340: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi341: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi342: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi343: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -8472,64 +8197,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; 
NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, 
%eax, %xmm0, %xmm0 @@ -8541,12 +8266,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -8629,7 +8349,6 @@ ; ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: ; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -8637,9 +8356,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -8744,7 +8464,6 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -8752,9 +8471,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, 
%zmm0, %k0 @@ -8790,8 +8510,8 @@ ; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -8799,9 +8519,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -8829,8 +8549,8 @@ ; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -8838,9 +8558,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -8880,8 +8600,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, 
%zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -8889,9 +8609,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -8933,8 +8653,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -8942,9 +8662,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -8978,8 +8698,8 @@ ; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; 
NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -8987,9 +8707,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9031,8 +8751,8 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -9040,9 +8760,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9076,17 +8796,17 @@ ; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9114,17 +8834,17 @@ ; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9164,17 +8884,17 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: 
vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9216,17 +8936,17 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9260,17 +8980,17 @@ ; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9312,17 +9032,17 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; 
NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -9353,12 +9073,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi344: +; NoVLX-NEXT: .Lcfi274: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi345: +; NoVLX-NEXT: .Lcfi275: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi346: +; NoVLX-NEXT: .Lcfi276: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9396,12 +9116,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi347: +; NoVLX-NEXT: .Lcfi277: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi348: +; NoVLX-NEXT: .Lcfi278: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi349: +; NoVLX-NEXT: .Lcfi279: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9441,12 +9161,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi350: +; NoVLX-NEXT: .Lcfi280: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; 
NoVLX-NEXT: .Lcfi351: +; NoVLX-NEXT: .Lcfi281: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi352: +; NoVLX-NEXT: .Lcfi282: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9498,12 +9218,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi353: +; NoVLX-NEXT: .Lcfi283: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi354: +; NoVLX-NEXT: .Lcfi284: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi355: +; NoVLX-NEXT: .Lcfi285: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9556,12 +9276,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi356: +; NoVLX-NEXT: .Lcfi286: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi357: +; NoVLX-NEXT: .Lcfi287: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi358: +; NoVLX-NEXT: .Lcfi288: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9603,12 +9323,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi359: +; NoVLX-NEXT: .Lcfi289: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi360: +; NoVLX-NEXT: .Lcfi290: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi361: +; NoVLX-NEXT: .Lcfi291: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -9663,12 +9383,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi362: +; NoVLX-NEXT: .Lcfi292: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi363: +; NoVLX-NEXT: .Lcfi293: ; NoVLX-NEXT: .cfi_offset 
%rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi364: +; NoVLX-NEXT: .Lcfi294: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9712,12 +9432,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi365: +; NoVLX-NEXT: .Lcfi295: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi366: +; NoVLX-NEXT: .Lcfi296: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi367: +; NoVLX-NEXT: .Lcfi297: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9763,16 +9483,15 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi368: +; NoVLX-NEXT: .Lcfi298: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi369: +; NoVLX-NEXT: .Lcfi299: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi370: +; NoVLX-NEXT: .Lcfi300: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -9780,9 +9499,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -9826,12 +9546,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi371: +; NoVLX-NEXT: 
.Lcfi301: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi372: +; NoVLX-NEXT: .Lcfi302: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi373: +; NoVLX-NEXT: .Lcfi303: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9890,12 +9610,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi374: +; NoVLX-NEXT: .Lcfi304: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi375: +; NoVLX-NEXT: .Lcfi305: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi376: +; NoVLX-NEXT: .Lcfi306: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -9943,17 +9663,16 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi377: +; NoVLX-NEXT: .Lcfi307: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi378: +; NoVLX-NEXT: .Lcfi308: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi379: +; NoVLX-NEXT: .Lcfi309: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -9961,9 +9680,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ 
-10014,8 +9734,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -10071,8 +9791,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -10130,7 +9850,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -10143,13 +9862,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -10209,7 +9929,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ 
-10222,13 +9941,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -10290,8 +10010,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -10351,7 +10071,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -10364,13 +10083,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ 
-10432,8 +10152,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -10488,8 +10208,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -10546,7 +10266,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -10559,13 +10278,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -10624,7 +10344,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -10637,13 +10356,14 @@ ; 
NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -10704,8 +10424,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -10764,7 +10484,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -10777,13 +10496,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -10840,12 +10560,12 @@ ; NoVLX-LABEL: 
test_vpcmpeqq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi380: +; NoVLX-NEXT: .Lcfi310: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi381: +; NoVLX-NEXT: .Lcfi311: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi382: +; NoVLX-NEXT: .Lcfi312: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -10885,12 +10605,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi383: +; NoVLX-NEXT: .Lcfi313: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi384: +; NoVLX-NEXT: .Lcfi314: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi385: +; NoVLX-NEXT: .Lcfi315: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -10932,12 +10652,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi386: +; NoVLX-NEXT: .Lcfi316: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi387: +; NoVLX-NEXT: .Lcfi317: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi388: +; NoVLX-NEXT: .Lcfi318: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -10945,7 +10665,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -10958,7 +10677,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, 
%xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -10999,12 +10719,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi389: +; NoVLX-NEXT: .Lcfi319: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi390: +; NoVLX-NEXT: .Lcfi320: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi391: +; NoVLX-NEXT: .Lcfi321: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11012,7 +10732,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -11025,7 +10744,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -11067,12 +10787,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi392: +; NoVLX-NEXT: .Lcfi322: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi393: +; NoVLX-NEXT: .Lcfi323: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi394: +; NoVLX-NEXT: .Lcfi324: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11116,12 +10836,12 @@ ; 
NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi395: +; NoVLX-NEXT: .Lcfi325: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi396: +; NoVLX-NEXT: .Lcfi326: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi397: +; NoVLX-NEXT: .Lcfi327: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11130,7 +10850,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -11143,7 +10862,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -11186,12 +10906,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi398: +; NoVLX-NEXT: .Lcfi328: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi399: +; NoVLX-NEXT: .Lcfi329: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi400: +; NoVLX-NEXT: .Lcfi330: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -11237,12 +10957,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi401: +; NoVLX-NEXT: .Lcfi331: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi402: +; NoVLX-NEXT: .Lcfi332: 
; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi403: +; NoVLX-NEXT: .Lcfi333: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -11290,17 +11010,16 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi404: +; NoVLX-NEXT: .Lcfi334: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi405: +; NoVLX-NEXT: .Lcfi335: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi406: +; NoVLX-NEXT: .Lcfi336: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -11318,6 +11037,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -11363,17 +11083,16 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi407: +; NoVLX-NEXT: .Lcfi337: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi408: +; NoVLX-NEXT: .Lcfi338: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi409: +; NoVLX-NEXT: .Lcfi339: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -11391,6 +11110,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; 
NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -11437,12 +11157,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi410: +; NoVLX-NEXT: .Lcfi340: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi411: +; NoVLX-NEXT: .Lcfi341: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi412: +; NoVLX-NEXT: .Lcfi342: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -11492,18 +11212,17 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi413: +; NoVLX-NEXT: .Lcfi343: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi414: +; NoVLX-NEXT: .Lcfi344: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi415: +; NoVLX-NEXT: .Lcfi345: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -11521,6 +11240,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -11732,12 +11452,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi416: +; NoVLX-NEXT: .Lcfi346: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi417: +; NoVLX-NEXT: .Lcfi347: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi418: +; 
NoVLX-NEXT: .Lcfi348: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11746,35 +11466,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd 
%xmm0, %zmm0 @@ -11805,12 +11525,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi419: +; NoVLX-NEXT: .Lcfi349: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi420: +; NoVLX-NEXT: .Lcfi350: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi421: +; NoVLX-NEXT: .Lcfi351: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11819,35 +11539,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; 
NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -11880,12 +11600,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi422: +; NoVLX-NEXT: .Lcfi352: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi423: +; NoVLX-NEXT: .Lcfi353: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi424: +; NoVLX-NEXT: .Lcfi354: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11895,35 +11615,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; 
NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -11957,12 +11677,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi425: +; NoVLX-NEXT: .Lcfi355: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi426: +; NoVLX-NEXT: .Lcfi356: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi427: +; NoVLX-NEXT: .Lcfi357: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -11972,35 +11692,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, 
%xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12035,12 +11755,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi428: +; NoVLX-NEXT: .Lcfi358: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi429: +; NoVLX-NEXT: .Lcfi359: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi430: +; NoVLX-NEXT: .Lcfi360: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -12049,35 +11769,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12111,12 +11831,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi431: +; NoVLX-NEXT: .Lcfi361: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi432: +; NoVLX-NEXT: .Lcfi362: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi433: +; NoVLX-NEXT: .Lcfi363: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; 
NoVLX-NEXT: subq $32, %rsp @@ -12126,35 +11846,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12190,12 +11910,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask: ; 
NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi434: +; NoVLX-NEXT: .Lcfi364: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi435: +; NoVLX-NEXT: .Lcfi365: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi436: +; NoVLX-NEXT: .Lcfi366: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -12268,12 +11988,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi437: +; NoVLX-NEXT: .Lcfi367: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi438: +; NoVLX-NEXT: .Lcfi368: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi439: +; NoVLX-NEXT: .Lcfi369: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -12348,12 +12068,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi440: +; NoVLX-NEXT: .Lcfi370: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi441: +; NoVLX-NEXT: .Lcfi371: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi442: +; NoVLX-NEXT: .Lcfi372: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -12430,12 +12150,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi443: +; NoVLX-NEXT: .Lcfi373: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi444: +; NoVLX-NEXT: .Lcfi374: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi445: +; NoVLX-NEXT: .Lcfi375: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -12513,12 +12233,12 @@ ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: 
.Lcfi446: +; NoVLX-NEXT: .Lcfi376: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi447: +; NoVLX-NEXT: .Lcfi377: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi448: +; NoVLX-NEXT: .Lcfi378: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -12594,12 +12314,12 @@ ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi449: +; NoVLX-NEXT: .Lcfi379: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi450: +; NoVLX-NEXT: .Lcfi380: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi451: +; NoVLX-NEXT: .Lcfi381: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -12677,12 +12397,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi452: +; NoVLX-NEXT: .Lcfi382: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi453: +; NoVLX-NEXT: .Lcfi383: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi454: +; NoVLX-NEXT: .Lcfi384: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -12691,15 +12411,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi455: +; NoVLX-NEXT: .Lcfi385: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi456: +; NoVLX-NEXT: .Lcfi386: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi457: +; NoVLX-NEXT: .Lcfi387: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi458: +; NoVLX-NEXT: .Lcfi388: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi459: +; NoVLX-NEXT: .Lcfi389: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12765,9 +12485,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: 
vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12802,12 +12522,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi460: +; NoVLX-NEXT: .Lcfi390: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi461: +; NoVLX-NEXT: .Lcfi391: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi462: +; NoVLX-NEXT: .Lcfi392: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -12816,15 +12536,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi463: +; NoVLX-NEXT: .Lcfi393: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi464: +; NoVLX-NEXT: .Lcfi394: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi465: +; NoVLX-NEXT: .Lcfi395: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi466: +; NoVLX-NEXT: .Lcfi396: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi467: +; NoVLX-NEXT: .Lcfi397: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12890,9 +12610,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -12929,12 +12649,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # 
%entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi468: +; NoVLX-NEXT: .Lcfi398: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi469: +; NoVLX-NEXT: .Lcfi399: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi470: +; NoVLX-NEXT: .Lcfi400: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -12943,15 +12663,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi471: +; NoVLX-NEXT: .Lcfi401: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi472: +; NoVLX-NEXT: .Lcfi402: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi473: +; NoVLX-NEXT: .Lcfi403: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi474: +; NoVLX-NEXT: .Lcfi404: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi475: +; NoVLX-NEXT: .Lcfi405: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -13018,9 +12738,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -13058,12 +12778,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi476: +; NoVLX-NEXT: .Lcfi406: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi477: +; NoVLX-NEXT: .Lcfi407: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi478: +; NoVLX-NEXT: .Lcfi408: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -13072,15 +12792,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, 
%rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi479: +; NoVLX-NEXT: .Lcfi409: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi480: +; NoVLX-NEXT: .Lcfi410: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi481: +; NoVLX-NEXT: .Lcfi411: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi482: +; NoVLX-NEXT: .Lcfi412: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi483: +; NoVLX-NEXT: .Lcfi413: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -13147,9 +12867,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -13188,30 +12908,15 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi484: +; NoVLX-NEXT: .Lcfi414: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi485: +; NoVLX-NEXT: .Lcfi415: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi486: +; NoVLX-NEXT: .Lcfi416: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi487: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi488: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi489: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi490: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi491: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -13222,64 +12927,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, 
%ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -13291,12 +12996,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -13318,30 +13018,15 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq 
%rbp -; NoVLX-NEXT: .Lcfi492: +; NoVLX-NEXT: .Lcfi417: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi493: +; NoVLX-NEXT: .Lcfi418: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi494: +; NoVLX-NEXT: .Lcfi419: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi495: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi496: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi497: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi498: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi499: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -13352,64 +13037,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw 
$9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, 
%xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -13421,12 +13106,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -13450,30 +13130,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi500: +; NoVLX-NEXT: .Lcfi420: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi501: +; NoVLX-NEXT: .Lcfi421: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi502: +; NoVLX-NEXT: .Lcfi422: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi503: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi504: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi505: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi506: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi507: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -13485,64 +13150,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw 
%k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, 
%xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -13554,12 +13219,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -13584,30 +13244,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi508: +; NoVLX-NEXT: .Lcfi423: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi509: +; NoVLX-NEXT: .Lcfi424: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: 
.Lcfi510: +; NoVLX-NEXT: .Lcfi425: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi511: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi512: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi513: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi514: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi515: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -13619,64 +13264,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, 
%eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -13688,12 +13333,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -13720,12 +13360,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi516: +; NoVLX-NEXT: .Lcfi426: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi517: +; NoVLX-NEXT: .Lcfi427: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi518: +; NoVLX-NEXT: .Lcfi428: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -13769,12 +13409,12 @@ ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi519: +; NoVLX-NEXT: .Lcfi429: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi520: +; NoVLX-NEXT: .Lcfi430: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi521: +; NoVLX-NEXT: .Lcfi431: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -13820,12 +13460,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi522: +; NoVLX-NEXT: .Lcfi432: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi523: +; NoVLX-NEXT: .Lcfi433: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi524: +; NoVLX-NEXT: .Lcfi434: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -13881,12 +13521,12 
@@ ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi525: +; NoVLX-NEXT: .Lcfi435: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi526: +; NoVLX-NEXT: .Lcfi436: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi527: +; NoVLX-NEXT: .Lcfi437: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -14065,12 +13705,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi528: +; NoVLX-NEXT: .Lcfi438: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi529: +; NoVLX-NEXT: .Lcfi439: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi530: +; NoVLX-NEXT: .Lcfi440: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -14082,35 +13722,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 
; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -14140,12 +13780,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi531: +; NoVLX-NEXT: .Lcfi441: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi532: +; NoVLX-NEXT: .Lcfi442: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi533: +; NoVLX-NEXT: .Lcfi443: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -14157,35 +13797,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, 
%xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -14217,12 +13857,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi534: +; NoVLX-NEXT: .Lcfi444: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi535: +; NoVLX-NEXT: .Lcfi445: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi536: +; NoVLX-NEXT: .Lcfi446: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -14235,35 +13875,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -14296,12 +13936,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi537: +; NoVLX-NEXT: .Lcfi447: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi538: +; NoVLX-NEXT: .Lcfi448: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi539: +; NoVLX-NEXT: .Lcfi449: ; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -14314,35 +13954,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -14376,12 
+14016,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi540: +; NoVLX-NEXT: .Lcfi450: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi541: +; NoVLX-NEXT: .Lcfi451: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi542: +; NoVLX-NEXT: .Lcfi452: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14456,12 +14096,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi543: +; NoVLX-NEXT: .Lcfi453: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi544: +; NoVLX-NEXT: .Lcfi454: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi545: +; NoVLX-NEXT: .Lcfi455: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14538,12 +14178,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi546: +; NoVLX-NEXT: .Lcfi456: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi547: +; NoVLX-NEXT: .Lcfi457: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi548: +; NoVLX-NEXT: .Lcfi458: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14622,12 +14262,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi549: +; NoVLX-NEXT: .Lcfi459: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi550: +; NoVLX-NEXT: .Lcfi460: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi551: +; NoVLX-NEXT: .Lcfi461: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -14708,12 +14348,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: ; NoVLX: 
# BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi552: +; NoVLX-NEXT: .Lcfi462: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi553: +; NoVLX-NEXT: .Lcfi463: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi554: +; NoVLX-NEXT: .Lcfi464: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -14722,15 +14362,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi555: +; NoVLX-NEXT: .Lcfi465: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi556: +; NoVLX-NEXT: .Lcfi466: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi557: +; NoVLX-NEXT: .Lcfi467: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi558: +; NoVLX-NEXT: .Lcfi468: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi559: +; NoVLX-NEXT: .Lcfi469: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -14796,9 +14436,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -14834,12 +14474,12 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi560: +; NoVLX-NEXT: .Lcfi470: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi561: +; NoVLX-NEXT: .Lcfi471: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi562: +; NoVLX-NEXT: .Lcfi472: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -14848,15 +14488,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, 
%rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi563: +; NoVLX-NEXT: .Lcfi473: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi564: +; NoVLX-NEXT: .Lcfi474: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi565: +; NoVLX-NEXT: .Lcfi475: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi566: +; NoVLX-NEXT: .Lcfi476: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi567: +; NoVLX-NEXT: .Lcfi477: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -14922,9 +14562,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -14962,12 +14602,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi568: +; NoVLX-NEXT: .Lcfi478: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi569: +; NoVLX-NEXT: .Lcfi479: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi570: +; NoVLX-NEXT: .Lcfi480: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -14976,15 +14616,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi571: +; NoVLX-NEXT: .Lcfi481: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi572: +; NoVLX-NEXT: .Lcfi482: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi573: +; NoVLX-NEXT: .Lcfi483: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi574: +; NoVLX-NEXT: .Lcfi484: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi575: +; NoVLX-NEXT: .Lcfi485: ; NoVLX-NEXT: .cfi_offset 
%r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -15051,9 +14691,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15092,12 +14732,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi576: +; NoVLX-NEXT: .Lcfi486: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi577: +; NoVLX-NEXT: .Lcfi487: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi578: +; NoVLX-NEXT: .Lcfi488: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -15106,15 +14746,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi579: +; NoVLX-NEXT: .Lcfi489: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi580: +; NoVLX-NEXT: .Lcfi490: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi581: +; NoVLX-NEXT: .Lcfi491: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi582: +; NoVLX-NEXT: .Lcfi492: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi583: +; NoVLX-NEXT: .Lcfi493: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -15181,9 +14821,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; 
NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -15223,30 +14863,15 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi584: +; NoVLX-NEXT: .Lcfi494: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi585: +; NoVLX-NEXT: .Lcfi495: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi586: +; NoVLX-NEXT: .Lcfi496: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi587: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi588: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi589: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi590: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi591: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -15257,64 +14882,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; 
NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, 
%r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -15326,12 +14951,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -15354,30 +14974,15 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi592: +; NoVLX-NEXT: .Lcfi497: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi593: +; NoVLX-NEXT: .Lcfi498: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi594: +; NoVLX-NEXT: .Lcfi499: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi595: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi596: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi597: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi598: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi599: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 
@@ -15388,64 +14993,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, 
%k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -15457,12 +15062,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -15487,30 +15087,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi600: +; 
NoVLX-NEXT: .Lcfi500: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi601: +; NoVLX-NEXT: .Lcfi501: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi602: +; NoVLX-NEXT: .Lcfi502: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi603: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi604: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi605: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi606: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi607: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -15522,64 +15107,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: 
vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb 
$10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -15591,12 +15176,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -15622,30 +15202,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi608: +; NoVLX-NEXT: .Lcfi503: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi609: +; NoVLX-NEXT: .Lcfi504: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi610: +; NoVLX-NEXT: .Lcfi505: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi611: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi612: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi613: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi614: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi615: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -15657,64 +15222,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: 
kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -15726,12 +15291,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -15758,323 +15318,323 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi616: +; NoVLX-NEXT: .Lcfi506: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi617: +; NoVLX-NEXT: .Lcfi507: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi618: +; NoVLX-NEXT: 
.Lcfi508: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm0, 
%rax +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rdx, %rcx +; NoVLX-NEXT: shrq $48, %rdx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: movl %eax, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $6, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rdx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: 
movl %edx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vmovd %edx, %xmm5 ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: movq %rdx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $3, %edx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $6, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: vpextrq $1, %xmm6, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movl %edx, %ecx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: shrl $16, %ecx -; 
NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm6, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm6, %rax -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm1, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm8 +; NoVLX-NEXT: vmovd %eax, %xmm6 +; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6 +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: 
vpinsrw $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm4, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm7 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx ; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rdx ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm2 
+; NoVLX-NEXT: vpextrq $1, %xmm1, %rcx +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %eax, %xmm6, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm3, %ymm0 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm8, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd 
%ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, 
%eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw 
%k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, 
%eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -16109,69 +15669,68 @@ ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi619: +; NoVLX-NEXT: .Lcfi509: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi620: +; NoVLX-NEXT: .Lcfi510: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi621: +; NoVLX-NEXT: .Lcfi511: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: 
vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; 
NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 @@ -16179,7 +15738,8 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax @@ -16189,92 +15749,92 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm1 -; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: 
kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vmovd %eax, %xmm0 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; 
NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 -; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -16377,12 +15937,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi622: +; NoVLX-NEXT: .Lcfi512: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi623: +; NoVLX-NEXT: .Lcfi513: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi624: +; NoVLX-NEXT: .Lcfi514: ; 
NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -16393,17 +15953,12 @@ ; NoVLX-NEXT: movq %rax, %rdx ; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 -; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -16411,9 +15966,10 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm4 @@ -16431,39 +15987,40 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: 
vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 @@ -16471,99 +16028,102 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm5 +; NoVLX-NEXT: vmovq %xmm5, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm6 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6 ; 
NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6 +; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm7 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm6 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vextracti32x4 
$1, %zmm1, %xmm7 +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm7 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 +; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; NoVLX-NEXT: 
vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm3 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm3 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm6 +; NoVLX-NEXT: vpmovdb %zmm5, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 @@ -16740,12 +16300,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi625: +; NoVLX-NEXT: .Lcfi515: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi626: +; NoVLX-NEXT: .Lcfi516: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi627: +; NoVLX-NEXT: .Lcfi517: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -16757,8 +16317,6 @@ ; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 -; 
NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax @@ -16771,19 +16329,20 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -16791,6 +16350,7 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 @@ -16813,174 +16373,174 @@ ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, 
%eax, %xmm4, %xmm4 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm4 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm3 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm2, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: 
kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovdb %zmm4, %xmm4 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm3, %ymm3 -; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 -; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 -; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: 
vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; 
NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm4, %xmm1, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -17021,8 +16581,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -17076,8 +16636,8 @@ ; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -17133,7 +16693,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -17146,13 +16705,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; 
NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -17210,7 +16770,6 @@ ; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -17223,13 +16782,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -17289,8 +16849,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -17348,7 +16908,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -17361,13 +16920,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw 
%k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -17427,8 +16987,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -17481,8 +17041,8 @@ ; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -17537,7 +17097,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -17550,13 +17109,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, 
%zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -17613,7 +17173,6 @@ ; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -17626,13 +17185,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -17691,8 +17251,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -17749,7 +17309,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -17762,13 +17321,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, 
%eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -17824,12 +17384,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi628: +; NoVLX-NEXT: .Lcfi518: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi629: +; NoVLX-NEXT: .Lcfi519: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi630: +; NoVLX-NEXT: .Lcfi520: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -17867,12 +17427,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi631: +; NoVLX-NEXT: .Lcfi521: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi632: +; NoVLX-NEXT: .Lcfi522: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi633: +; NoVLX-NEXT: .Lcfi523: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -17912,19 +17472,18 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi634: +; NoVLX-NEXT: .Lcfi524: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi635: +; NoVLX-NEXT: .Lcfi525: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi636: +; NoVLX-NEXT: .Lcfi526: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; 
NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -17937,7 +17496,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -17977,19 +17537,18 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi637: +; NoVLX-NEXT: .Lcfi527: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi638: +; NoVLX-NEXT: .Lcfi528: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi639: +; NoVLX-NEXT: .Lcfi529: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -18002,7 +17561,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -18043,12 
+17603,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi640: +; NoVLX-NEXT: .Lcfi530: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi641: +; NoVLX-NEXT: .Lcfi531: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi642: +; NoVLX-NEXT: .Lcfi532: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18090,12 +17650,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi643: +; NoVLX-NEXT: .Lcfi533: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi644: +; NoVLX-NEXT: .Lcfi534: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi645: +; NoVLX-NEXT: .Lcfi535: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18103,7 +17663,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -18116,7 +17675,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -18158,12 +17718,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi646: +; NoVLX-NEXT: .Lcfi536: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: 
.Lcfi647: +; NoVLX-NEXT: .Lcfi537: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi648: +; NoVLX-NEXT: .Lcfi538: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -18207,12 +17767,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi649: +; NoVLX-NEXT: .Lcfi539: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi650: +; NoVLX-NEXT: .Lcfi540: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi651: +; NoVLX-NEXT: .Lcfi541: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -18258,16 +17818,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi652: +; NoVLX-NEXT: .Lcfi542: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi653: +; NoVLX-NEXT: .Lcfi543: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi654: +; NoVLX-NEXT: .Lcfi544: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -18279,13 +17838,14 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand 
%xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -18329,12 +17889,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi655: +; NoVLX-NEXT: .Lcfi545: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi656: +; NoVLX-NEXT: .Lcfi546: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi657: +; NoVLX-NEXT: .Lcfi547: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -18401,12 +17961,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi658: +; NoVLX-NEXT: .Lcfi548: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi659: +; NoVLX-NEXT: .Lcfi549: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi660: +; NoVLX-NEXT: .Lcfi550: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -18454,17 +18014,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi661: +; NoVLX-NEXT: .Lcfi551: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi662: +; NoVLX-NEXT: .Lcfi552: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi663: +; NoVLX-NEXT: .Lcfi553: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -18476,13 +18035,14 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; 
NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -18717,12 +18277,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi664: +; NoVLX-NEXT: .Lcfi554: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi665: +; NoVLX-NEXT: .Lcfi555: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi666: +; NoVLX-NEXT: .Lcfi556: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18733,35 +18293,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: 
kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -18792,12 +18352,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi667: +; NoVLX-NEXT: .Lcfi557: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi668: +; NoVLX-NEXT: .Lcfi558: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi669: +; NoVLX-NEXT: .Lcfi559: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18808,35 +18368,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; 
NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -18869,12 +18429,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi670: +; NoVLX-NEXT: .Lcfi560: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi671: +; NoVLX-NEXT: .Lcfi561: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi672: +; NoVLX-NEXT: .Lcfi562: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18887,35 +18447,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; 
NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -18949,12 +18509,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi673: +; NoVLX-NEXT: .Lcfi563: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi674: +; NoVLX-NEXT: .Lcfi564: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; 
NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi675: +; NoVLX-NEXT: .Lcfi565: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -18967,35 +18527,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; 
NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -19030,12 +18590,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi676: +; NoVLX-NEXT: .Lcfi566: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi677: +; NoVLX-NEXT: .Lcfi567: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi678: +; NoVLX-NEXT: .Lcfi568: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -19046,35 +18606,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, 
%esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -19108,12 +18668,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi679: +; NoVLX-NEXT: .Lcfi569: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi680: +; NoVLX-NEXT: .Lcfi570: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi681: +; NoVLX-NEXT: .Lcfi571: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -19126,35 +18686,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw 
$9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -19190,12 +18750,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi682: +; NoVLX-NEXT: .Lcfi572: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi683: +; NoVLX-NEXT: .Lcfi573: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi684: +; NoVLX-NEXT: .Lcfi574: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -19270,12 +18830,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi685: +; NoVLX-NEXT: .Lcfi575: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi686: +; NoVLX-NEXT: .Lcfi576: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi687: +; NoVLX-NEXT: .Lcfi577: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -19352,12 +18912,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi688: +; NoVLX-NEXT: .Lcfi578: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi689: +; NoVLX-NEXT: .Lcfi579: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi690: +; NoVLX-NEXT: .Lcfi580: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -19437,12 +18997,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi691: +; NoVLX-NEXT: .Lcfi581: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi692: +; NoVLX-NEXT: .Lcfi582: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi693: +; NoVLX-NEXT: .Lcfi583: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -19523,12 +19083,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi694: +; NoVLX-NEXT: .Lcfi584: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi695: +; NoVLX-NEXT: .Lcfi585: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi696: +; NoVLX-NEXT: .Lcfi586: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -19606,12 +19166,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi697: +; NoVLX-NEXT: .Lcfi587: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi698: +; NoVLX-NEXT: .Lcfi588: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi699: +; NoVLX-NEXT: .Lcfi589: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -19693,12 +19253,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi700: +; NoVLX-NEXT: .Lcfi590: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi701: +; 
NoVLX-NEXT: .Lcfi591: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi702: +; NoVLX-NEXT: .Lcfi592: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -19707,15 +19267,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi703: +; NoVLX-NEXT: .Lcfi593: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi704: +; NoVLX-NEXT: .Lcfi594: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi705: +; NoVLX-NEXT: .Lcfi595: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi706: +; NoVLX-NEXT: .Lcfi596: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi707: +; NoVLX-NEXT: .Lcfi597: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -19778,9 +19338,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -19816,12 +19376,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi708: +; NoVLX-NEXT: .Lcfi598: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi709: +; NoVLX-NEXT: .Lcfi599: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi710: +; NoVLX-NEXT: .Lcfi600: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -19830,15 +19390,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi711: +; NoVLX-NEXT: .Lcfi601: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi712: +; NoVLX-NEXT: 
.Lcfi602: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi713: +; NoVLX-NEXT: .Lcfi603: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi714: +; NoVLX-NEXT: .Lcfi604: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi715: +; NoVLX-NEXT: .Lcfi605: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -19901,9 +19461,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -19941,12 +19501,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi716: +; NoVLX-NEXT: .Lcfi606: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi717: +; NoVLX-NEXT: .Lcfi607: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi718: +; NoVLX-NEXT: .Lcfi608: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -19955,15 +19515,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi719: +; NoVLX-NEXT: .Lcfi609: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi720: +; NoVLX-NEXT: .Lcfi610: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi721: +; NoVLX-NEXT: .Lcfi611: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi722: +; NoVLX-NEXT: .Lcfi612: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi723: +; NoVLX-NEXT: .Lcfi613: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} @@ -20027,9 +19587,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; 
NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -20068,12 +19628,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi724: +; NoVLX-NEXT: .Lcfi614: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi725: +; NoVLX-NEXT: .Lcfi615: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi726: +; NoVLX-NEXT: .Lcfi616: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20082,15 +19642,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi727: +; NoVLX-NEXT: .Lcfi617: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi728: +; NoVLX-NEXT: .Lcfi618: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi729: +; NoVLX-NEXT: .Lcfi619: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi730: +; NoVLX-NEXT: .Lcfi620: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi731: +; NoVLX-NEXT: .Lcfi621: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} @@ -20154,9 +19714,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -20196,12 +19756,12 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: ; 
NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi732: +; NoVLX-NEXT: .Lcfi622: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi733: +; NoVLX-NEXT: .Lcfi623: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi734: +; NoVLX-NEXT: .Lcfi624: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20210,15 +19770,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi735: +; NoVLX-NEXT: .Lcfi625: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi736: +; NoVLX-NEXT: .Lcfi626: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi737: +; NoVLX-NEXT: .Lcfi627: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi738: +; NoVLX-NEXT: .Lcfi628: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi739: +; NoVLX-NEXT: .Lcfi629: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -20281,9 +19841,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -20322,12 +19882,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi740: +; NoVLX-NEXT: .Lcfi630: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi741: +; NoVLX-NEXT: .Lcfi631: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi742: +; NoVLX-NEXT: .Lcfi632: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -20336,15 +19896,15 @@ ; NoVLX-NEXT: pushq %rbx ; 
NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi743: +; NoVLX-NEXT: .Lcfi633: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi744: +; NoVLX-NEXT: .Lcfi634: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi745: +; NoVLX-NEXT: .Lcfi635: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi746: +; NoVLX-NEXT: .Lcfi636: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi747: +; NoVLX-NEXT: .Lcfi637: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} @@ -20408,9 +19968,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -20451,30 +20011,15 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi748: +; NoVLX-NEXT: .Lcfi638: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi749: +; NoVLX-NEXT: .Lcfi639: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi750: +; NoVLX-NEXT: .Lcfi640: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi751: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi752: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi753: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi754: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi755: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, 
%k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -20482,64 +20027,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; 
NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -20551,12 +20096,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -20579,30 +20119,15 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # 
%entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi756: +; NoVLX-NEXT: .Lcfi641: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi757: +; NoVLX-NEXT: .Lcfi642: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi758: +; NoVLX-NEXT: .Lcfi643: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi759: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi760: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi761: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi762: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi763: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -20610,64 +20135,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, 
%eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: 
vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -20679,12 +20204,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -20709,30 +20229,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi764: +; NoVLX-NEXT: .Lcfi644: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi765: +; NoVLX-NEXT: .Lcfi645: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi766: +; NoVLX-NEXT: .Lcfi646: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi767: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi768: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi769: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi770: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi771: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -20741,64 +20246,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; 
NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb 
$10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -20810,12 +20315,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -20841,30 +20341,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi772: +; NoVLX-NEXT: .Lcfi647: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi773: +; NoVLX-NEXT: .Lcfi648: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp 
-; NoVLX-NEXT: .Lcfi774: +; NoVLX-NEXT: .Lcfi649: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi775: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi776: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi777: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi778: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi779: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -20873,64 +20358,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: 
vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, 
%r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -20942,12 +20427,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -20974,30 +20454,15 @@ ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi780: +; NoVLX-NEXT: .Lcfi650: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi781: +; NoVLX-NEXT: .Lcfi651: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi782: +; NoVLX-NEXT: .Lcfi652: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi783: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi784: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi785: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi786: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi787: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -21005,64 +20470,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -21074,12 +20539,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -21105,30 +20565,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi788: +; NoVLX-NEXT: .Lcfi653: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi789: +; NoVLX-NEXT: .Lcfi654: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi790: +; NoVLX-NEXT: .Lcfi655: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq 
%rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi791: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi792: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi793: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi794: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi795: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -21137,64 +20582,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -21206,12 +20651,7 @@ ; 
NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -21294,7 +20734,6 @@ ; ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask: ; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -21302,9 +20741,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -21409,7 +20849,6 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -21417,9 +20856,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -21455,8 +20895,8 @@ ; 
NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -21464,9 +20904,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -21494,8 +20934,8 @@ ; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -21503,9 +20943,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -21545,8 +20985,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: 
vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -21554,9 +20994,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -21598,8 +21038,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -21607,9 +21047,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -21643,8 +21083,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, 
%k0 @@ -21652,9 +21092,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -21696,8 +21136,8 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -21705,9 +21145,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -21741,17 +21181,17 @@ ; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: 
vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -21779,17 +21219,17 @@ ; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -21829,17 +21269,17 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, 
%zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -21881,17 +21321,17 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -21925,17 +21365,17 @@ ; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -21977,17 +21417,17 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: 
vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -22018,12 +21458,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi796: +; NoVLX-NEXT: .Lcfi656: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi797: +; NoVLX-NEXT: .Lcfi657: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi798: +; NoVLX-NEXT: .Lcfi658: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -22061,12 +21501,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi799: +; NoVLX-NEXT: .Lcfi659: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi800: +; NoVLX-NEXT: .Lcfi660: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi801: +; NoVLX-NEXT: .Lcfi661: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -22106,12 +21546,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi802: +; NoVLX-NEXT: .Lcfi662: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; 
NoVLX-NEXT: .Lcfi803: +; NoVLX-NEXT: .Lcfi663: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi804: +; NoVLX-NEXT: .Lcfi664: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -22163,12 +21603,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi805: +; NoVLX-NEXT: .Lcfi665: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi806: +; NoVLX-NEXT: .Lcfi666: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi807: +; NoVLX-NEXT: .Lcfi667: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -22221,12 +21661,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi808: +; NoVLX-NEXT: .Lcfi668: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi809: +; NoVLX-NEXT: .Lcfi669: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi810: +; NoVLX-NEXT: .Lcfi670: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -22268,12 +21708,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi811: +; NoVLX-NEXT: .Lcfi671: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi812: +; NoVLX-NEXT: .Lcfi672: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi813: +; NoVLX-NEXT: .Lcfi673: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -22328,12 +21768,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi814: +; NoVLX-NEXT: .Lcfi674: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi815: +; NoVLX-NEXT: .Lcfi675: ; NoVLX-NEXT: 
.cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi816: +; NoVLX-NEXT: .Lcfi676: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -22377,12 +21817,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi817: +; NoVLX-NEXT: .Lcfi677: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi818: +; NoVLX-NEXT: .Lcfi678: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi819: +; NoVLX-NEXT: .Lcfi679: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -22428,16 +21868,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi820: +; NoVLX-NEXT: .Lcfi680: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi821: +; NoVLX-NEXT: .Lcfi681: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi822: +; NoVLX-NEXT: .Lcfi682: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -22445,9 +21884,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -22491,12 +21931,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi823: 
+; NoVLX-NEXT: .Lcfi683: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi824: +; NoVLX-NEXT: .Lcfi684: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi825: +; NoVLX-NEXT: .Lcfi685: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -22555,12 +21995,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi826: +; NoVLX-NEXT: .Lcfi686: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi827: +; NoVLX-NEXT: .Lcfi687: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi828: +; NoVLX-NEXT: .Lcfi688: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -22608,17 +22048,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi829: +; NoVLX-NEXT: .Lcfi689: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi830: +; NoVLX-NEXT: .Lcfi690: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi831: +; NoVLX-NEXT: .Lcfi691: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -22626,9 +22065,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) @@ -22679,8 +22119,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -22736,8 +22176,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -22795,7 +22235,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -22808,13 +22247,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -22874,7 +22314,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw 
$15, %k0, %k3 @@ -22887,13 +22326,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -22955,8 +22395,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -23016,7 +22456,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -23029,13 +22468,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, 
%k0, %k0 @@ -23097,8 +22537,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -23153,8 +22593,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -23211,7 +22651,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -23224,13 +22663,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -23289,7 +22729,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -23302,13 +22741,14 
@@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -23369,8 +22809,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -23429,7 +22869,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -23442,13 +22881,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -23505,12 +22945,12 @@ ; NoVLX-LABEL: 
test_vpcmpsgtq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi832: +; NoVLX-NEXT: .Lcfi692: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi833: +; NoVLX-NEXT: .Lcfi693: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi834: +; NoVLX-NEXT: .Lcfi694: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23550,12 +22990,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi835: +; NoVLX-NEXT: .Lcfi695: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi836: +; NoVLX-NEXT: .Lcfi696: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi837: +; NoVLX-NEXT: .Lcfi697: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23597,12 +23037,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi838: +; NoVLX-NEXT: .Lcfi698: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi839: +; NoVLX-NEXT: .Lcfi699: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi840: +; NoVLX-NEXT: .Lcfi700: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23610,7 +23050,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -23623,7 +23062,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, 
%xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -23664,12 +23104,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi841: +; NoVLX-NEXT: .Lcfi701: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi842: +; NoVLX-NEXT: .Lcfi702: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi843: +; NoVLX-NEXT: .Lcfi703: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23677,7 +23117,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -23690,7 +23129,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -23732,12 +23172,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi844: +; NoVLX-NEXT: .Lcfi704: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi845: +; NoVLX-NEXT: .Lcfi705: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi846: +; NoVLX-NEXT: .Lcfi706: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23781,12 +23221,12 @@ ; 
NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi847: +; NoVLX-NEXT: .Lcfi707: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi848: +; NoVLX-NEXT: .Lcfi708: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi849: +; NoVLX-NEXT: .Lcfi709: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -23795,7 +23235,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -23808,7 +23247,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -23851,12 +23291,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi850: +; NoVLX-NEXT: .Lcfi710: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi851: +; NoVLX-NEXT: .Lcfi711: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi852: +; NoVLX-NEXT: .Lcfi712: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -23902,12 +23342,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi853: +; NoVLX-NEXT: .Lcfi713: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi854: +; NoVLX-NEXT: 
.Lcfi714: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi855: +; NoVLX-NEXT: .Lcfi715: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -23955,17 +23395,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi856: +; NoVLX-NEXT: .Lcfi716: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi857: +; NoVLX-NEXT: .Lcfi717: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi858: +; NoVLX-NEXT: .Lcfi718: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -23983,6 +23422,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -24028,17 +23468,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi859: +; NoVLX-NEXT: .Lcfi719: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi860: +; NoVLX-NEXT: .Lcfi720: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi861: +; NoVLX-NEXT: .Lcfi721: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -24056,6 +23495,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, 
%xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -24102,12 +23542,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi862: +; NoVLX-NEXT: .Lcfi722: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi863: +; NoVLX-NEXT: .Lcfi723: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi864: +; NoVLX-NEXT: .Lcfi724: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -24157,18 +23597,17 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi865: +; NoVLX-NEXT: .Lcfi725: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi866: +; NoVLX-NEXT: .Lcfi726: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi867: +; NoVLX-NEXT: .Lcfi727: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -24186,6 +23625,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -24397,12 +23837,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi868: +; NoVLX-NEXT: .Lcfi728: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi869: +; NoVLX-NEXT: .Lcfi729: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; 
NoVLX-NEXT: .Lcfi870: +; NoVLX-NEXT: .Lcfi730: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -24411,35 +23851,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24470,12 +23910,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi871: +; NoVLX-NEXT: .Lcfi731: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi872: +; NoVLX-NEXT: .Lcfi732: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi873: +; NoVLX-NEXT: .Lcfi733: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -24484,35 +23924,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, 
%edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24545,12 +23985,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi874: +; NoVLX-NEXT: .Lcfi734: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi875: +; NoVLX-NEXT: .Lcfi735: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi876: +; NoVLX-NEXT: .Lcfi736: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -24560,35 +24000,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; 
NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24622,12 +24062,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi877: +; NoVLX-NEXT: .Lcfi737: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi878: +; NoVLX-NEXT: .Lcfi738: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi879: +; NoVLX-NEXT: .Lcfi739: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -24637,35 +24077,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; 
NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24700,12 +24140,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi880: +; NoVLX-NEXT: .Lcfi740: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi881: +; NoVLX-NEXT: .Lcfi741: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi882: +; NoVLX-NEXT: .Lcfi742: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -24714,35 +24154,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, 
%xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24776,12 +24216,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi883: +; NoVLX-NEXT: .Lcfi743: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi884: +; NoVLX-NEXT: .Lcfi744: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi885: +; NoVLX-NEXT: .Lcfi745: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp 
; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -24791,35 +24231,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -24855,12 +24295,12 @@ ; NoVLX-LABEL: 
test_vpcmpsgtq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi886: +; NoVLX-NEXT: .Lcfi746: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi887: +; NoVLX-NEXT: .Lcfi747: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi888: +; NoVLX-NEXT: .Lcfi748: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -24933,12 +24373,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi889: +; NoVLX-NEXT: .Lcfi749: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi890: +; NoVLX-NEXT: .Lcfi750: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi891: +; NoVLX-NEXT: .Lcfi751: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -25013,12 +24453,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi892: +; NoVLX-NEXT: .Lcfi752: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi893: +; NoVLX-NEXT: .Lcfi753: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi894: +; NoVLX-NEXT: .Lcfi754: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -25095,12 +24535,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi895: +; NoVLX-NEXT: .Lcfi755: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi896: +; NoVLX-NEXT: .Lcfi756: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi897: +; NoVLX-NEXT: .Lcfi757: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -25178,12 +24618,12 @@ ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; 
NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi898: +; NoVLX-NEXT: .Lcfi758: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi899: +; NoVLX-NEXT: .Lcfi759: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi900: +; NoVLX-NEXT: .Lcfi760: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -25259,12 +24699,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi901: +; NoVLX-NEXT: .Lcfi761: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi902: +; NoVLX-NEXT: .Lcfi762: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi903: +; NoVLX-NEXT: .Lcfi763: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -25342,12 +24782,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi904: +; NoVLX-NEXT: .Lcfi764: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi905: +; NoVLX-NEXT: .Lcfi765: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi906: +; NoVLX-NEXT: .Lcfi766: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25356,15 +24796,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi907: +; NoVLX-NEXT: .Lcfi767: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi908: +; NoVLX-NEXT: .Lcfi768: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi909: +; NoVLX-NEXT: .Lcfi769: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi910: +; NoVLX-NEXT: .Lcfi770: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi911: +; NoVLX-NEXT: .Lcfi771: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -25432,9 +24872,9 @@ ; NoVLX-NEXT: 
vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -25469,12 +24909,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi912: +; NoVLX-NEXT: .Lcfi772: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi913: +; NoVLX-NEXT: .Lcfi773: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi914: +; NoVLX-NEXT: .Lcfi774: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25483,15 +24923,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi915: +; NoVLX-NEXT: .Lcfi775: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi916: +; NoVLX-NEXT: .Lcfi776: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi917: +; NoVLX-NEXT: .Lcfi777: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi918: +; NoVLX-NEXT: .Lcfi778: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi919: +; NoVLX-NEXT: .Lcfi779: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 @@ -25560,9 +25000,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -25599,12 +25039,12 @@ ; NoVLX-LABEL: 
test_masked_vpcmpsgeb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi920: +; NoVLX-NEXT: .Lcfi780: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi921: +; NoVLX-NEXT: .Lcfi781: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi922: +; NoVLX-NEXT: .Lcfi782: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25613,15 +25053,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi923: +; NoVLX-NEXT: .Lcfi783: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi924: +; NoVLX-NEXT: .Lcfi784: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi925: +; NoVLX-NEXT: .Lcfi785: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi926: +; NoVLX-NEXT: .Lcfi786: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi927: +; NoVLX-NEXT: .Lcfi787: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -25690,9 +25130,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -25730,12 +25170,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi928: +; NoVLX-NEXT: .Lcfi788: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi929: +; NoVLX-NEXT: .Lcfi789: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi930: +; NoVLX-NEXT: .Lcfi790: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -25744,15 
+25184,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi931: +; NoVLX-NEXT: .Lcfi791: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi932: +; NoVLX-NEXT: .Lcfi792: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi933: +; NoVLX-NEXT: .Lcfi793: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi934: +; NoVLX-NEXT: .Lcfi794: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi935: +; NoVLX-NEXT: .Lcfi795: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 @@ -25822,9 +25262,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -25863,30 +25303,15 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi936: +; NoVLX-NEXT: .Lcfi796: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi937: +; NoVLX-NEXT: .Lcfi797: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi938: +; NoVLX-NEXT: .Lcfi798: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi939: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi940: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi941: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi942: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi943: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, 
%xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -25899,64 +25324,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
-; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -25968,12 +25393,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -25995,30 +25415,15 @@ ; NoVLX-LABEL: 
test_vpcmpsgeb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi944: +; NoVLX-NEXT: .Lcfi799: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi945: +; NoVLX-NEXT: .Lcfi800: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi946: +; NoVLX-NEXT: .Lcfi801: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi947: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi948: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi949: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi950: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi951: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -26032,64 +25437,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
-; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; 
NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -26101,12 +25506,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -26130,30 +25530,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi952: +; NoVLX-NEXT: .Lcfi802: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi953: +; NoVLX-NEXT: .Lcfi803: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi954: +; NoVLX-NEXT: .Lcfi804: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi955: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi956: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi957: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi958: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi959: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -26167,64 +25552,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; 
NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -26236,12 +25621,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -26266,30 +25646,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi960: +; NoVLX-NEXT: .Lcfi805: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; 
NoVLX-NEXT: .Lcfi961: +; NoVLX-NEXT: .Lcfi806: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi962: +; NoVLX-NEXT: .Lcfi807: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi963: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi964: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi965: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi966: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi967: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -26304,64 +25669,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, 
%xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -26373,12 +25738,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -26405,12 +25765,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi968: +; NoVLX-NEXT: .Lcfi808: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi969: +; NoVLX-NEXT: .Lcfi809: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi970: +; NoVLX-NEXT: .Lcfi810: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -26456,12 +25816,12 @@ ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi971: +; NoVLX-NEXT: .Lcfi811: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi972: +; NoVLX-NEXT: .Lcfi812: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi973: +; NoVLX-NEXT: .Lcfi813: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -26510,12 +25870,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi974: +; NoVLX-NEXT: .Lcfi814: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi975: +; NoVLX-NEXT: .Lcfi815: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi976: +; NoVLX-NEXT: 
.Lcfi816: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -26573,12 +25933,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi977: +; NoVLX-NEXT: .Lcfi817: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi978: +; NoVLX-NEXT: .Lcfi818: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi979: +; NoVLX-NEXT: .Lcfi819: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -26770,12 +26130,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi980: +; NoVLX-NEXT: .Lcfi820: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi981: +; NoVLX-NEXT: .Lcfi821: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi982: +; NoVLX-NEXT: .Lcfi822: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -26789,35 +26149,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; 
NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26847,12 +26207,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi983: +; NoVLX-NEXT: .Lcfi823: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi984: +; NoVLX-NEXT: .Lcfi824: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi985: +; NoVLX-NEXT: .Lcfi825: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -26867,35 +26227,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -26927,12 +26287,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi986: +; NoVLX-NEXT: .Lcfi826: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi987: +; NoVLX-NEXT: .Lcfi827: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi988: +; NoVLX-NEXT: .Lcfi828: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -26947,35 +26307,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, 
%r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -27008,12 +26368,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi989: +; NoVLX-NEXT: .Lcfi829: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi990: +; NoVLX-NEXT: .Lcfi830: ; 
NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi991: +; NoVLX-NEXT: .Lcfi831: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -27029,35 +26389,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 
; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -27091,12 +26451,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi992: +; NoVLX-NEXT: .Lcfi832: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi993: +; NoVLX-NEXT: .Lcfi833: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi994: +; NoVLX-NEXT: .Lcfi834: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -27173,12 +26533,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi995: +; NoVLX-NEXT: .Lcfi835: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi996: +; NoVLX-NEXT: .Lcfi836: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi997: +; NoVLX-NEXT: .Lcfi837: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -27258,12 +26618,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi998: +; NoVLX-NEXT: .Lcfi838: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi999: +; NoVLX-NEXT: .Lcfi839: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1000: +; NoVLX-NEXT: .Lcfi840: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -27344,12 +26704,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1001: +; NoVLX-NEXT: .Lcfi841: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1002: +; NoVLX-NEXT: .Lcfi842: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1003: +; NoVLX-NEXT: .Lcfi843: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; 
NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -27433,12 +26793,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1004: +; NoVLX-NEXT: .Lcfi844: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1005: +; NoVLX-NEXT: .Lcfi845: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1006: +; NoVLX-NEXT: .Lcfi846: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27447,15 +26807,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1007: +; NoVLX-NEXT: .Lcfi847: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1008: +; NoVLX-NEXT: .Lcfi848: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1009: +; NoVLX-NEXT: .Lcfi849: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1010: +; NoVLX-NEXT: .Lcfi850: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1011: +; NoVLX-NEXT: .Lcfi851: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -27523,9 +26883,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -27561,12 +26921,12 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1012: +; NoVLX-NEXT: .Lcfi852: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1013: +; NoVLX-NEXT: .Lcfi853: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1014: +; NoVLX-NEXT: .Lcfi854: ; 
NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27575,15 +26935,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1015: +; NoVLX-NEXT: .Lcfi855: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1016: +; NoVLX-NEXT: .Lcfi856: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1017: +; NoVLX-NEXT: .Lcfi857: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1018: +; NoVLX-NEXT: .Lcfi858: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1019: +; NoVLX-NEXT: .Lcfi859: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 @@ -27652,9 +27012,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -27692,12 +27052,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1020: +; NoVLX-NEXT: .Lcfi860: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1021: +; NoVLX-NEXT: .Lcfi861: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1022: +; NoVLX-NEXT: .Lcfi862: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27706,15 +27066,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1023: +; NoVLX-NEXT: .Lcfi863: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1024: +; NoVLX-NEXT: .Lcfi864: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1025: +; NoVLX-NEXT: .Lcfi865: ; NoVLX-NEXT: .cfi_offset 
%r13, -40 -; NoVLX-NEXT: .Lcfi1026: +; NoVLX-NEXT: .Lcfi866: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1027: +; NoVLX-NEXT: .Lcfi867: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -27783,9 +27143,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -27824,12 +27184,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1028: +; NoVLX-NEXT: .Lcfi868: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1029: +; NoVLX-NEXT: .Lcfi869: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1030: +; NoVLX-NEXT: .Lcfi870: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -27838,15 +27198,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1031: +; NoVLX-NEXT: .Lcfi871: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1032: +; NoVLX-NEXT: .Lcfi872: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1033: +; NoVLX-NEXT: .Lcfi873: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1034: +; NoVLX-NEXT: .Lcfi874: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1035: +; NoVLX-NEXT: .Lcfi875: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 @@ -27916,9 +27276,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: 
kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -27958,30 +27318,15 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1036: +; NoVLX-NEXT: .Lcfi876: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1037: +; NoVLX-NEXT: .Lcfi877: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1038: +; NoVLX-NEXT: .Lcfi878: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1039: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1040: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1041: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1042: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1043: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 @@ -27994,64 +27339,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: 
kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, 
%xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -28063,12 +27408,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -28091,30 +27431,15 @@ ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1044: +; NoVLX-NEXT: .Lcfi879: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1045: +; NoVLX-NEXT: .Lcfi880: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1046: +; NoVLX-NEXT: .Lcfi881: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp -; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1047: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1048: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1049: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1050: -; 
NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1051: -; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -28128,64 +27453,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, 
%ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -28197,12 +27522,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq 
%r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -28227,30 +27547,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1052: +; NoVLX-NEXT: .Lcfi882: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1053: +; NoVLX-NEXT: .Lcfi883: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1054: +; NoVLX-NEXT: .Lcfi884: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1055: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1056: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1057: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1058: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1059: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 @@ -28264,64 +27569,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; 
NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, 
%xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -28333,12 +27638,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -28364,30 +27664,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1060: +; NoVLX-NEXT: .Lcfi885: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1061: +; NoVLX-NEXT: .Lcfi886: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1062: +; NoVLX-NEXT: .Lcfi887: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1063: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1064: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1065: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1066: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1067: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, 
%ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 @@ -28402,64 +27687,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; 
NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -28471,12 +27756,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -28503,62 +27783,58 @@ ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; 
NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1068: +; NoVLX-NEXT: .Lcfi888: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1069: +; NoVLX-NEXT: .Lcfi889: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1070: +; NoVLX-NEXT: .Lcfi890: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, 
%eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm5 @@ -28566,263 +27842,267 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; 
NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 ; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, 
%ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm8 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rax +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rcx +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm8, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; 
NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm2 +; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm3 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpxor %ymm1, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw 
$9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: 
vptestmd %zmm2, %zmm2, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 -; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, 
%eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -28857,69 +28137,68 @@ ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1071: +; NoVLX-NEXT: .Lcfi891: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1072: 
+; NoVLX-NEXT: .Lcfi892: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1073: +; NoVLX-NEXT: .Lcfi893: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, 
%xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 @@ -28927,7 +28206,8 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax @@ -28937,97 +28217,95 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: 
shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; NoVLX-NEXT: vmovdqa (%rdi), %ymm2 -; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 -; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm3 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm3, %ymm3 +; NoVLX-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm0, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, 
%xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm2 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vpxor %ymm0, %ymm2, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -29091,9 +28369,11 @@ ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -29130,12 +28410,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1074: +; NoVLX-NEXT: .Lcfi894: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1075: +; NoVLX-NEXT: .Lcfi895: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1076: +; NoVLX-NEXT: .Lcfi896: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -29146,17 
+28426,12 @@ ; NoVLX-NEXT: movq %rax, %rdx ; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 -; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -29164,9 +28439,10 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm8 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm4 @@ -29184,39 +28460,40 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 @@ -29224,69 +28501,52 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm7, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx 
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm5 ; NoVLX-NEXT: vmovq %xmm5, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm6 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm6, %xmm6 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm6, %xmm6 ; NoVLX-NEXT: vpextrq $1, %xmm5, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm6, %xmm5 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6 +; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm7 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm7, %xmm7 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm7, %xmm7 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; 
NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm6 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm6, %xmm6 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm6, %xmm6 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 -; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm6, %xmm6 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm7 +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm2 @@ -29294,17 +28554,7 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm3 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx @@ -29316,7 +28566,37 @@ ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vmovd %eax, %xmm7 +; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm7, %xmm7 +; 
NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $2, %ecx, %xmm7, %xmm7 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rcx +; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm3 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $3, %eax, %xmm7, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $6, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm2, %ymm2 ; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 ; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2 @@ -29496,12 +28776,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1077: +; NoVLX-NEXT: .Lcfi897: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1078: +; NoVLX-NEXT: .Lcfi898: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1079: +; NoVLX-NEXT: .Lcfi899: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -29513,8 +28793,6 @@ ; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax @@ -29527,19 +28805,20 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 @@ -29547,6 +28826,7 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 @@ -29784,8 +29064,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -29842,8 +29122,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; 
NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -29899,7 +29179,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -29912,13 +29191,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -29977,7 +29257,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -29990,13 +29269,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ 
-30059,8 +29339,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -30119,7 +29399,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -30132,13 +29411,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -30200,8 +29480,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -30257,8 +29537,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, 
%zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -30313,7 +29593,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -30326,13 +29605,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -30390,7 +29670,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -30403,13 +29682,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ 
-30471,8 +29751,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -30530,7 +29810,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -30543,13 +29822,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -30605,12 +29885,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1080: +; NoVLX-NEXT: .Lcfi900: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1081: +; NoVLX-NEXT: .Lcfi901: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1082: +; NoVLX-NEXT: .Lcfi902: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -30650,12 +29930,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1083: +; NoVLX-NEXT: .Lcfi903: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1084: +; NoVLX-NEXT: .Lcfi904: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1085: +; NoVLX-NEXT: .Lcfi905: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -30698,19 +29978,18 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1086: +; NoVLX-NEXT: .Lcfi906: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1087: +; NoVLX-NEXT: .Lcfi907: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1088: +; NoVLX-NEXT: .Lcfi908: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -30723,7 +30002,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -30763,12 +30043,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1089: +; NoVLX-NEXT: .Lcfi909: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1090: +; NoVLX-NEXT: .Lcfi910: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1091: +; NoVLX-NEXT: .Lcfi911: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; 
NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -30776,7 +30056,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -30789,7 +30068,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -30831,12 +30111,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1092: +; NoVLX-NEXT: .Lcfi912: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1093: +; NoVLX-NEXT: .Lcfi913: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1094: +; NoVLX-NEXT: .Lcfi914: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -30881,12 +30161,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1095: +; NoVLX-NEXT: .Lcfi915: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1096: +; NoVLX-NEXT: .Lcfi916: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1097: +; NoVLX-NEXT: .Lcfi917: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -30894,7 +30174,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -30907,7 +30186,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -30949,12 +30229,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1098: +; NoVLX-NEXT: .Lcfi918: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1099: +; NoVLX-NEXT: .Lcfi919: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1100: +; NoVLX-NEXT: .Lcfi920: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -31000,12 +30280,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1101: +; NoVLX-NEXT: .Lcfi921: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1102: +; NoVLX-NEXT: .Lcfi922: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1103: +; NoVLX-NEXT: .Lcfi923: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -31054,16 +30334,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1104: +; NoVLX-NEXT: .Lcfi924: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1105: +; NoVLX-NEXT: .Lcfi925: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1106: +; NoVLX-NEXT: 
.Lcfi926: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -31075,13 +30354,14 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -31125,17 +30405,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1107: +; NoVLX-NEXT: .Lcfi927: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1108: +; NoVLX-NEXT: .Lcfi928: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1109: +; NoVLX-NEXT: .Lcfi929: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -31147,13 +30426,14 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; 
NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -31199,12 +30479,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1110: +; NoVLX-NEXT: .Lcfi930: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1111: +; NoVLX-NEXT: .Lcfi931: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1112: +; NoVLX-NEXT: .Lcfi932: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -31255,17 +30535,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1113: +; NoVLX-NEXT: .Lcfi933: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1114: +; NoVLX-NEXT: .Lcfi934: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1115: +; NoVLX-NEXT: .Lcfi935: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -31277,13 +30556,14 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, 
%eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -31520,12 +30800,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1116: +; NoVLX-NEXT: .Lcfi936: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1117: +; NoVLX-NEXT: .Lcfi937: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1118: +; NoVLX-NEXT: .Lcfi938: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -31536,35 +30816,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -31595,12 +30875,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1119: +; NoVLX-NEXT: .Lcfi939: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1120: +; NoVLX-NEXT: .Lcfi940: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1121: +; NoVLX-NEXT: .Lcfi941: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -31611,35 +30891,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -31672,12 +30952,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1122: +; NoVLX-NEXT: .Lcfi942: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1123: +; NoVLX-NEXT: .Lcfi943: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1124: +; NoVLX-NEXT: .Lcfi944: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -31690,35 +30970,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 
; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -31752,12 +31032,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1125: +; NoVLX-NEXT: .Lcfi945: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1126: +; NoVLX-NEXT: .Lcfi946: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1127: +; NoVLX-NEXT: .Lcfi947: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; 
NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -31770,35 +31050,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -31834,12 +31114,12 @@ ; NoVLX-LABEL: 
test_vpcmpsged_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1128: +; NoVLX-NEXT: .Lcfi948: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1129: +; NoVLX-NEXT: .Lcfi949: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1130: +; NoVLX-NEXT: .Lcfi950: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -31850,35 +31130,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -31913,12 +31193,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1131: +; NoVLX-NEXT: .Lcfi951: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1132: +; NoVLX-NEXT: .Lcfi952: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1133: +; NoVLX-NEXT: .Lcfi953: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -31931,35 +31211,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, 
%xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -31995,12 +31275,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1134: +; NoVLX-NEXT: .Lcfi954: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1135: +; NoVLX-NEXT: .Lcfi955: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1136: +; NoVLX-NEXT: .Lcfi956: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -32075,12 +31355,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1137: +; NoVLX-NEXT: .Lcfi957: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1138: +; NoVLX-NEXT: .Lcfi958: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1139: +; NoVLX-NEXT: .Lcfi959: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -32157,12 +31437,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1140: +; NoVLX-NEXT: .Lcfi960: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1141: +; NoVLX-NEXT: 
.Lcfi961: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1142: +; NoVLX-NEXT: .Lcfi962: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -32242,12 +31522,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1143: +; NoVLX-NEXT: .Lcfi963: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1144: +; NoVLX-NEXT: .Lcfi964: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1145: +; NoVLX-NEXT: .Lcfi965: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -32329,12 +31609,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1146: +; NoVLX-NEXT: .Lcfi966: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1147: +; NoVLX-NEXT: .Lcfi967: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1148: +; NoVLX-NEXT: .Lcfi968: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -32413,12 +31693,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1149: +; NoVLX-NEXT: .Lcfi969: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1150: +; NoVLX-NEXT: .Lcfi970: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1151: +; NoVLX-NEXT: .Lcfi971: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -32500,12 +31780,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1152: +; NoVLX-NEXT: .Lcfi972: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1153: +; NoVLX-NEXT: .Lcfi973: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; 
NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1154: +; NoVLX-NEXT: .Lcfi974: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32514,15 +31794,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1155: +; NoVLX-NEXT: .Lcfi975: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1156: +; NoVLX-NEXT: .Lcfi976: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1157: +; NoVLX-NEXT: .Lcfi977: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1158: +; NoVLX-NEXT: .Lcfi978: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1159: +; NoVLX-NEXT: .Lcfi979: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -32585,9 +31865,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -32623,12 +31903,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1160: +; NoVLX-NEXT: .Lcfi980: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1161: +; NoVLX-NEXT: .Lcfi981: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1162: +; NoVLX-NEXT: .Lcfi982: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32637,15 +31917,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1163: +; NoVLX-NEXT: .Lcfi983: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1164: +; NoVLX-NEXT: .Lcfi984: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; 
NoVLX-NEXT: .Lcfi1165: +; NoVLX-NEXT: .Lcfi985: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1166: +; NoVLX-NEXT: .Lcfi986: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1167: +; NoVLX-NEXT: .Lcfi987: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -32708,9 +31988,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -32748,12 +32028,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1168: +; NoVLX-NEXT: .Lcfi988: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1169: +; NoVLX-NEXT: .Lcfi989: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1170: +; NoVLX-NEXT: .Lcfi990: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32762,15 +32042,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1171: +; NoVLX-NEXT: .Lcfi991: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1172: +; NoVLX-NEXT: .Lcfi992: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1173: +; NoVLX-NEXT: .Lcfi993: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1174: +; NoVLX-NEXT: .Lcfi994: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1175: +; NoVLX-NEXT: .Lcfi995: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} @@ -32834,9 +32114,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -32875,12 +32155,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1176: +; NoVLX-NEXT: .Lcfi996: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1177: +; NoVLX-NEXT: .Lcfi997: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1178: +; NoVLX-NEXT: .Lcfi998: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -32889,15 +32169,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1179: +; NoVLX-NEXT: .Lcfi999: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1180: +; NoVLX-NEXT: .Lcfi1000: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1181: +; NoVLX-NEXT: .Lcfi1001: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1182: +; NoVLX-NEXT: .Lcfi1002: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1183: +; NoVLX-NEXT: .Lcfi1003: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} @@ -32961,9 +32241,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -33004,12 +32284,12 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; 
NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1184: +; NoVLX-NEXT: .Lcfi1004: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1185: +; NoVLX-NEXT: .Lcfi1005: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1186: +; NoVLX-NEXT: .Lcfi1006: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -33018,15 +32298,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1187: +; NoVLX-NEXT: .Lcfi1007: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1188: +; NoVLX-NEXT: .Lcfi1008: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1189: +; NoVLX-NEXT: .Lcfi1009: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1190: +; NoVLX-NEXT: .Lcfi1010: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1191: +; NoVLX-NEXT: .Lcfi1011: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 @@ -33090,9 +32370,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -33132,12 +32412,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1192: +; NoVLX-NEXT: .Lcfi1012: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1193: +; NoVLX-NEXT: .Lcfi1013: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1194: +; NoVLX-NEXT: .Lcfi1014: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -33146,15 +32426,15 @@ ; NoVLX-NEXT: pushq %rbx ; 
NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1195: +; NoVLX-NEXT: .Lcfi1015: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1196: +; NoVLX-NEXT: .Lcfi1016: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1197: +; NoVLX-NEXT: .Lcfi1017: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1198: +; NoVLX-NEXT: .Lcfi1018: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1199: +; NoVLX-NEXT: .Lcfi1019: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1 ; NoVLX-NEXT: kmovw %edi, %k1 @@ -33219,9 +32499,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -33262,30 +32542,15 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1200: +; NoVLX-NEXT: .Lcfi1020: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1201: +; NoVLX-NEXT: .Lcfi1021: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1202: +; NoVLX-NEXT: .Lcfi1022: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1203: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1204: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1205: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1206: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1207: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: 
kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -33293,64 +32558,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; 
NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -33362,12 +32627,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -33390,30 +32650,15 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # 
%entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1208: +; NoVLX-NEXT: .Lcfi1023: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1209: +; NoVLX-NEXT: .Lcfi1024: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1210: +; NoVLX-NEXT: .Lcfi1025: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1211: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1212: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1213: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1214: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1215: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -33421,64 +32666,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: 
kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; 
NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -33490,12 +32735,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -33520,30 +32760,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1216: +; NoVLX-NEXT: .Lcfi1026: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1217: +; NoVLX-NEXT: .Lcfi1027: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1218: +; NoVLX-NEXT: .Lcfi1028: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1219: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1220: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1221: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1222: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1223: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -33552,64 +32777,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: 
kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; 
NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -33621,12 +32846,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -33652,30 +32872,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1224: +; NoVLX-NEXT: .Lcfi1029: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1225: +; NoVLX-NEXT: .Lcfi1030: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; 
NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1226: +; NoVLX-NEXT: .Lcfi1031: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1227: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1228: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1229: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1230: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1231: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -33684,64 +32889,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: 
kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -33753,12 +32958,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -33786,30 +32986,15 @@ ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1232: +; NoVLX-NEXT: .Lcfi1032: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1233: +; NoVLX-NEXT: .Lcfi1033: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1234: +; NoVLX-NEXT: .Lcfi1034: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1235: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1236: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1237: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1238: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1239: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -33818,64 +33003,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw 
$13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, 
%eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -33887,12 +33072,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -33919,30 +33099,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1240: +; NoVLX-NEXT: .Lcfi1035: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1241: +; NoVLX-NEXT: .Lcfi1036: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1242: +; NoVLX-NEXT: .Lcfi1037: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; 
NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1243: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1244: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1245: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1246: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1247: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} @@ -33952,64 +33117,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, 
%eax, %xmm0, %xmm0 @@ -34021,12 +33186,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -34114,7 +33274,6 @@ ; ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask: ; NoVLX: # BB#0: # %entry -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -34122,9 +33281,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -34156,7 +33316,6 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -34164,9 +33323,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd 
%zmm0, %zmm0, %k0 @@ -34234,7 +33394,6 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -34242,9 +33401,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -34282,8 +33442,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -34291,9 +33451,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34324,8 +33484,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, 
%eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -34333,9 +33493,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34375,8 +33535,8 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -34384,9 +33544,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34429,8 +33589,8 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -34438,9 
+33598,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34477,8 +33637,8 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -34486,9 +33646,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34531,8 +33691,8 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -34540,9 +33700,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; 
NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34578,17 +33738,17 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34619,17 +33779,17 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; 
NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34669,17 +33829,17 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34722,17 +33882,17 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34769,17 +33929,17 @@ ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, 
%xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34822,17 +33982,17 @@ ; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -34863,12 +34023,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1248: +; NoVLX-NEXT: .Lcfi1038: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1249: +; NoVLX-NEXT: .Lcfi1039: ; NoVLX-NEXT: .cfi_offset %rbp, 
-16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1250: +; NoVLX-NEXT: .Lcfi1040: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -34908,12 +34068,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1251: +; NoVLX-NEXT: .Lcfi1041: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1252: +; NoVLX-NEXT: .Lcfi1042: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1253: +; NoVLX-NEXT: .Lcfi1043: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -34956,12 +34116,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1254: +; NoVLX-NEXT: .Lcfi1044: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1255: +; NoVLX-NEXT: .Lcfi1045: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1256: +; NoVLX-NEXT: .Lcfi1046: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35013,12 +34173,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1257: +; NoVLX-NEXT: .Lcfi1047: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1258: +; NoVLX-NEXT: .Lcfi1048: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1259: +; NoVLX-NEXT: .Lcfi1049: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35073,12 +34233,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1260: +; NoVLX-NEXT: .Lcfi1050: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1261: +; NoVLX-NEXT: .Lcfi1051: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; 
NoVLX-NEXT: .Lcfi1262: +; NoVLX-NEXT: .Lcfi1052: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35123,12 +34283,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1263: +; NoVLX-NEXT: .Lcfi1053: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1264: +; NoVLX-NEXT: .Lcfi1054: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1265: +; NoVLX-NEXT: .Lcfi1055: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -35183,12 +34343,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1266: +; NoVLX-NEXT: .Lcfi1056: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1267: +; NoVLX-NEXT: .Lcfi1057: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1268: +; NoVLX-NEXT: .Lcfi1058: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -35234,12 +34394,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1269: +; NoVLX-NEXT: .Lcfi1059: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1270: +; NoVLX-NEXT: .Lcfi1060: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1271: +; NoVLX-NEXT: .Lcfi1061: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -35288,16 +34448,15 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1272: +; NoVLX-NEXT: .Lcfi1062: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1273: +; NoVLX-NEXT: .Lcfi1063: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1274: +; NoVLX-NEXT: 
.Lcfi1064: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -35305,9 +34464,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -35351,17 +34511,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1275: +; NoVLX-NEXT: .Lcfi1065: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1276: +; NoVLX-NEXT: .Lcfi1066: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1277: +; NoVLX-NEXT: .Lcfi1067: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -35369,9 +34528,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ 
-35417,12 +34577,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1278: +; NoVLX-NEXT: .Lcfi1068: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1279: +; NoVLX-NEXT: .Lcfi1069: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1280: +; NoVLX-NEXT: .Lcfi1070: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -35473,17 +34633,16 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1281: +; NoVLX-NEXT: .Lcfi1071: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1282: +; NoVLX-NEXT: .Lcfi1072: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1283: +; NoVLX-NEXT: .Lcfi1073: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -35491,9 +34650,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -35546,8 +34706,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: 
kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -35606,8 +34766,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -35667,7 +34827,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -35680,13 +34839,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -35749,7 +34909,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -35762,13 +34921,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: 
vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -35833,8 +34993,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -35897,7 +35057,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -35910,13 +35069,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -35980,8 +35140,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, 
%eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -36039,8 +35199,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -36099,7 +35259,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -36112,13 +35271,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -36180,7 +35340,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -36193,13 +35352,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: 
vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -36263,8 +35423,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -36326,7 +35486,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -36339,13 +35498,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -36402,12 +35562,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1284: +; NoVLX-NEXT: .Lcfi1074: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1285: +; NoVLX-NEXT: .Lcfi1075: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1286: +; NoVLX-NEXT: 
.Lcfi1076: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36449,12 +35609,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1287: +; NoVLX-NEXT: .Lcfi1077: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1288: +; NoVLX-NEXT: .Lcfi1078: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1289: +; NoVLX-NEXT: .Lcfi1079: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36499,12 +35659,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1290: +; NoVLX-NEXT: .Lcfi1080: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1291: +; NoVLX-NEXT: .Lcfi1081: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1292: +; NoVLX-NEXT: .Lcfi1082: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36514,7 +35674,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -36527,7 +35686,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -36568,12 +35728,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; 
NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1293: +; NoVLX-NEXT: .Lcfi1083: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1294: +; NoVLX-NEXT: .Lcfi1084: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1295: +; NoVLX-NEXT: .Lcfi1085: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36584,7 +35744,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -36597,7 +35756,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -36640,12 +35800,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1296: +; NoVLX-NEXT: .Lcfi1086: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1297: +; NoVLX-NEXT: .Lcfi1087: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1298: +; NoVLX-NEXT: .Lcfi1088: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36692,12 +35852,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1299: +; NoVLX-NEXT: .Lcfi1089: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1300: +; NoVLX-NEXT: .Lcfi1090: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: 
movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1301: +; NoVLX-NEXT: .Lcfi1091: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -36708,7 +35868,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -36721,7 +35880,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -36764,12 +35924,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1302: +; NoVLX-NEXT: .Lcfi1092: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1303: +; NoVLX-NEXT: .Lcfi1093: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1304: +; NoVLX-NEXT: .Lcfi1094: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -36817,12 +35977,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1305: +; NoVLX-NEXT: .Lcfi1095: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1306: +; NoVLX-NEXT: .Lcfi1096: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1307: +; NoVLX-NEXT: .Lcfi1097: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -36873,19 +36033,18 @@ ; NoVLX-LABEL: 
test_masked_vpcmpsgeq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1308: +; NoVLX-NEXT: .Lcfi1098: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1309: +; NoVLX-NEXT: .Lcfi1099: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1310: +; NoVLX-NEXT: .Lcfi1100: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -36903,6 +36062,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -36948,12 +36108,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1311: +; NoVLX-NEXT: .Lcfi1101: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1312: +; NoVLX-NEXT: .Lcfi1102: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1313: +; NoVLX-NEXT: .Lcfi1103: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -36961,7 +36121,6 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -36979,6 +36138,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand 
%xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -37026,12 +36186,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1314: +; NoVLX-NEXT: .Lcfi1104: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1315: +; NoVLX-NEXT: .Lcfi1105: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1316: +; NoVLX-NEXT: .Lcfi1106: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -37084,12 +36244,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1317: +; NoVLX-NEXT: .Lcfi1107: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1318: +; NoVLX-NEXT: .Lcfi1108: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1319: +; NoVLX-NEXT: .Lcfi1109: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -37097,7 +36257,6 @@ ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -37115,6 +36274,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -37330,12 +36490,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1320: +; NoVLX-NEXT: .Lcfi1110: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1321: +; NoVLX-NEXT: .Lcfi1111: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp 
-; NoVLX-NEXT: .Lcfi1322: +; NoVLX-NEXT: .Lcfi1112: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -37344,35 +36504,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, 
%xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -37403,12 +36563,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1323: +; NoVLX-NEXT: .Lcfi1113: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1324: +; NoVLX-NEXT: .Lcfi1114: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1325: +; NoVLX-NEXT: .Lcfi1115: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -37417,35 +36577,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: 
vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -37478,12 +36638,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1326: +; NoVLX-NEXT: .Lcfi1116: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1327: +; NoVLX-NEXT: .Lcfi1117: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1328: +; NoVLX-NEXT: .Lcfi1118: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -37493,35 +36653,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -37555,12 +36715,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1329: +; NoVLX-NEXT: .Lcfi1119: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1330: +; NoVLX-NEXT: .Lcfi1120: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1331: +; NoVLX-NEXT: .Lcfi1121: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -37570,35 +36730,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, 
%k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -37634,12 +36794,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1332: +; NoVLX-NEXT: .Lcfi1122: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1333: +; NoVLX-NEXT: .Lcfi1123: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1334: +; NoVLX-NEXT: .Lcfi1124: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -37649,35 +36809,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, 
%eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -37712,12 +36872,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1335: +; NoVLX-NEXT: .Lcfi1125: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1336: +; NoVLX-NEXT: .Lcfi1126: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1337: +; NoVLX-NEXT: .Lcfi1127: ; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -37728,35 +36888,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -37792,12 
+36952,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1338: +; NoVLX-NEXT: .Lcfi1128: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1339: +; NoVLX-NEXT: .Lcfi1129: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1340: +; NoVLX-NEXT: .Lcfi1130: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -37870,12 +37030,12 @@ ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1341: +; NoVLX-NEXT: .Lcfi1131: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1342: +; NoVLX-NEXT: .Lcfi1132: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1343: +; NoVLX-NEXT: .Lcfi1133: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -37950,12 +37110,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1344: +; NoVLX-NEXT: .Lcfi1134: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1345: +; NoVLX-NEXT: .Lcfi1135: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1346: +; NoVLX-NEXT: .Lcfi1136: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -38032,12 +37192,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1347: +; NoVLX-NEXT: .Lcfi1137: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1348: +; NoVLX-NEXT: .Lcfi1138: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1349: +; NoVLX-NEXT: .Lcfi1139: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -38116,12 +37276,12 @@ ; NoVLX-LABEL: 
test_vpcmpsgeq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1350: +; NoVLX-NEXT: .Lcfi1140: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1351: +; NoVLX-NEXT: .Lcfi1141: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1352: +; NoVLX-NEXT: .Lcfi1142: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -38199,12 +37359,12 @@ ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1353: +; NoVLX-NEXT: .Lcfi1143: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1354: +; NoVLX-NEXT: .Lcfi1144: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1355: +; NoVLX-NEXT: .Lcfi1145: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -38283,12 +37443,12 @@ ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1356: +; NoVLX-NEXT: .Lcfi1146: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1357: +; NoVLX-NEXT: .Lcfi1147: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1358: +; NoVLX-NEXT: .Lcfi1148: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -38297,15 +37457,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1359: +; NoVLX-NEXT: .Lcfi1149: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1360: +; NoVLX-NEXT: .Lcfi1150: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1361: +; NoVLX-NEXT: .Lcfi1151: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1362: +; NoVLX-NEXT: .Lcfi1152: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1363: +; NoVLX-NEXT: .Lcfi1153: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} 
xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -38374,9 +37534,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -38411,12 +37571,12 @@ ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1364: +; NoVLX-NEXT: .Lcfi1154: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1365: +; NoVLX-NEXT: .Lcfi1155: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1366: +; NoVLX-NEXT: .Lcfi1156: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -38425,15 +37585,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1367: +; NoVLX-NEXT: .Lcfi1157: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1368: +; NoVLX-NEXT: .Lcfi1158: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1369: +; NoVLX-NEXT: .Lcfi1159: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1370: +; NoVLX-NEXT: .Lcfi1160: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1371: +; NoVLX-NEXT: .Lcfi1161: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -38502,9 +37662,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb 
$14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -38541,12 +37701,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1372: +; NoVLX-NEXT: .Lcfi1162: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1373: +; NoVLX-NEXT: .Lcfi1163: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1374: +; NoVLX-NEXT: .Lcfi1164: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -38555,15 +37715,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1375: +; NoVLX-NEXT: .Lcfi1165: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1376: +; NoVLX-NEXT: .Lcfi1166: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1377: +; NoVLX-NEXT: .Lcfi1167: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1378: +; NoVLX-NEXT: .Lcfi1168: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1379: +; NoVLX-NEXT: .Lcfi1169: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -38633,9 +37793,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -38673,12 +37833,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1380: +; NoVLX-NEXT: .Lcfi1170: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1381: +; NoVLX-NEXT: .Lcfi1171: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1382: +; NoVLX-NEXT: .Lcfi1172: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -38687,15 +37847,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1383: +; NoVLX-NEXT: .Lcfi1173: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1384: +; NoVLX-NEXT: .Lcfi1174: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1385: +; NoVLX-NEXT: .Lcfi1175: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1386: +; NoVLX-NEXT: .Lcfi1176: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1387: +; NoVLX-NEXT: .Lcfi1177: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -38765,9 +37925,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -38806,30 +37966,15 @@ ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1388: +; NoVLX-NEXT: .Lcfi1178: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1389: +; NoVLX-NEXT: .Lcfi1179: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1390: +; NoVLX-NEXT: .Lcfi1180: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq 
$-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1391: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1392: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1393: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1394: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1395: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 @@ -38843,64 +37988,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, 
%eax, %xmm0, %xmm0 @@ -38912,12 +38057,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -38939,30 +38079,15 @@ ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1396: +; NoVLX-NEXT: .Lcfi1181: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1397: +; NoVLX-NEXT: .Lcfi1182: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1398: +; NoVLX-NEXT: .Lcfi1183: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1399: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1400: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1401: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1402: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1403: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 @@ -38976,64 +38101,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, 
%ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw 
%k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -39045,12 +38170,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -39074,30 +38194,15 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1404: +; NoVLX-NEXT: .Lcfi1184: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1405: +; NoVLX-NEXT: .Lcfi1185: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1406: +; NoVLX-NEXT: .Lcfi1186: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1407: -; NoVLX-NEXT: 
.cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1408: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1409: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1410: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1411: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 @@ -39112,64 +38217,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
-; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -39181,12 +38286,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; 
NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -39211,30 +38311,15 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1412: +; NoVLX-NEXT: .Lcfi1187: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1413: +; NoVLX-NEXT: .Lcfi1188: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1414: +; NoVLX-NEXT: .Lcfi1189: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1415: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1416: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1417: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1418: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1419: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 @@ -39249,64 +38334,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, 
%eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb 
$12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -39318,12 +38403,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -39350,12 +38430,12 @@ ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1420: +; NoVLX-NEXT: .Lcfi1190: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1421: +; NoVLX-NEXT: .Lcfi1191: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1422: +; NoVLX-NEXT: .Lcfi1192: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -39402,12 +38482,12 @@ ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1423: +; NoVLX-NEXT: .Lcfi1193: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1424: +; 
NoVLX-NEXT: .Lcfi1194: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1425: +; NoVLX-NEXT: .Lcfi1195: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -39456,12 +38536,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1426: +; NoVLX-NEXT: .Lcfi1196: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1427: +; NoVLX-NEXT: .Lcfi1197: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1428: +; NoVLX-NEXT: .Lcfi1198: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -39520,12 +38600,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1429: +; NoVLX-NEXT: .Lcfi1199: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1430: +; NoVLX-NEXT: .Lcfi1200: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1431: +; NoVLX-NEXT: .Lcfi1201: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -39535,10 +38615,10 @@ ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} -; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; NoVLX-NEXT: vpxor %ymm3, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor (%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 ; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm3, %ymm0 ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3 ; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2 @@ -39719,12 +38799,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: 
.Lcfi1432: +; NoVLX-NEXT: .Lcfi1202: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1433: +; NoVLX-NEXT: .Lcfi1203: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1434: +; NoVLX-NEXT: .Lcfi1204: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -39739,35 +38819,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; 
NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -39797,12 +38877,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1435: +; NoVLX-NEXT: .Lcfi1205: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1436: +; NoVLX-NEXT: .Lcfi1206: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1437: +; NoVLX-NEXT: .Lcfi1207: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -39817,35 +38897,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; 
NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -39877,12 +38957,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1438: +; NoVLX-NEXT: .Lcfi1208: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1439: +; NoVLX-NEXT: .Lcfi1209: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1440: +; NoVLX-NEXT: .Lcfi1210: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -39898,35 +38978,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -39959,12 +39039,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1441: +; NoVLX-NEXT: .Lcfi1211: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1442: +; NoVLX-NEXT: .Lcfi1212: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1443: +; NoVLX-NEXT: .Lcfi1213: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -39980,35 +39060,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor 
%xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -40042,12 +39122,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1444: +; NoVLX-NEXT: .Lcfi1214: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1445: +; NoVLX-NEXT: .Lcfi1215: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1446: +; NoVLX-NEXT: .Lcfi1216: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -40125,12 +39205,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: 
pushq %rbp -; NoVLX-NEXT: .Lcfi1447: +; NoVLX-NEXT: .Lcfi1217: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1448: +; NoVLX-NEXT: .Lcfi1218: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1449: +; NoVLX-NEXT: .Lcfi1219: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -40210,12 +39290,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1450: +; NoVLX-NEXT: .Lcfi1220: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1451: +; NoVLX-NEXT: .Lcfi1221: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1452: +; NoVLX-NEXT: .Lcfi1222: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -40297,12 +39377,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1453: +; NoVLX-NEXT: .Lcfi1223: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1454: +; NoVLX-NEXT: .Lcfi1224: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1455: +; NoVLX-NEXT: .Lcfi1225: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -40386,12 +39466,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1456: +; NoVLX-NEXT: .Lcfi1226: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1457: +; NoVLX-NEXT: .Lcfi1227: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1458: +; NoVLX-NEXT: .Lcfi1228: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -40400,15 +39480,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1459: +; NoVLX-NEXT: .Lcfi1229: ; 
NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1460: +; NoVLX-NEXT: .Lcfi1230: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1461: +; NoVLX-NEXT: .Lcfi1231: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1462: +; NoVLX-NEXT: .Lcfi1232: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1463: +; NoVLX-NEXT: .Lcfi1233: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -40477,9 +39557,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -40515,12 +39595,12 @@ ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1464: +; NoVLX-NEXT: .Lcfi1234: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1465: +; NoVLX-NEXT: .Lcfi1235: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1466: +; NoVLX-NEXT: .Lcfi1236: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -40529,15 +39609,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1467: +; NoVLX-NEXT: .Lcfi1237: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1468: +; NoVLX-NEXT: .Lcfi1238: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1469: +; NoVLX-NEXT: .Lcfi1239: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1470: +; NoVLX-NEXT: .Lcfi1240: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1471: +; NoVLX-NEXT: .Lcfi1241: 
; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 @@ -40606,9 +39686,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -40646,12 +39726,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1472: +; NoVLX-NEXT: .Lcfi1242: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1473: +; NoVLX-NEXT: .Lcfi1243: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1474: +; NoVLX-NEXT: .Lcfi1244: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -40660,15 +39740,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1475: +; NoVLX-NEXT: .Lcfi1245: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1476: +; NoVLX-NEXT: .Lcfi1246: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1477: +; NoVLX-NEXT: .Lcfi1247: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1478: +; NoVLX-NEXT: .Lcfi1248: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1479: +; NoVLX-NEXT: .Lcfi1249: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -40738,9 +39818,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: 
vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -40779,12 +39859,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1480: +; NoVLX-NEXT: .Lcfi1250: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1481: +; NoVLX-NEXT: .Lcfi1251: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1482: +; NoVLX-NEXT: .Lcfi1252: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -40793,15 +39873,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1483: +; NoVLX-NEXT: .Lcfi1253: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1484: +; NoVLX-NEXT: .Lcfi1254: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1485: +; NoVLX-NEXT: .Lcfi1255: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1486: +; NoVLX-NEXT: .Lcfi1256: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1487: +; NoVLX-NEXT: .Lcfi1257: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 @@ -40871,9 +39951,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -40913,30 +39993,15 @@ ; 
NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1488: +; NoVLX-NEXT: .Lcfi1258: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1489: +; NoVLX-NEXT: .Lcfi1259: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1490: +; NoVLX-NEXT: .Lcfi1260: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1491: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1492: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1493: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1494: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1495: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 @@ -40950,64 +40015,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; 
NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, 
%r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -41019,12 +40084,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -41047,30 +40107,15 @@ ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1496: +; NoVLX-NEXT: .Lcfi1261: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1497: +; NoVLX-NEXT: .Lcfi1262: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1498: +; NoVLX-NEXT: .Lcfi1263: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1499: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1500: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1501: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1502: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1503: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = 
[32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 @@ -41084,64 +40129,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -41153,12 +40198,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; 
NoVLX-NEXT: retq entry: @@ -41183,30 +40223,15 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1504: +; NoVLX-NEXT: .Lcfi1264: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1505: +; NoVLX-NEXT: .Lcfi1265: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1506: +; NoVLX-NEXT: .Lcfi1266: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1507: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1508: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1509: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1510: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1511: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 @@ -41221,64 +40246,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; 
NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; 
NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -41290,12 +40315,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -41321,30 +40341,15 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1512: +; NoVLX-NEXT: .Lcfi1267: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1513: +; NoVLX-NEXT: .Lcfi1268: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1514: +; NoVLX-NEXT: .Lcfi1269: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1515: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1516: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1517: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1518: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1519: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vmovdqa {{.*#+}} 
ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 @@ -41359,64 +40364,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, 
%k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -41428,12 +40433,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; 
NoVLX-NEXT: retq entry: @@ -41460,62 +40460,58 @@ ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1520: +; NoVLX-NEXT: .Lcfi1270: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1521: +; NoVLX-NEXT: .Lcfi1271: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1522: +; NoVLX-NEXT: .Lcfi1272: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 -; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 -; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, 
%xmm2, %xmm9 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: vmovq %xmm4, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vmovd %ecx, %xmm5 @@ -41523,198 +40519,200 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm4 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $6, 
%ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 ; NoVLX-NEXT: vmovq %xmm6, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm5 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrq $1, %xmm6, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, 
%xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm6 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vmovq %xmm7, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm8 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, 
%eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm7 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm8, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm0 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, 
%xmm4 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm3 -; NoVLX-NEXT: vpxor %ymm2, %ymm4, %ymm4 -; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3 -; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 -; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 -; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm4 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vpxor %ymm3, %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $4, 
%eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: 
kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpmovsxbd %xmm3, %zmm3 -; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 -; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 -; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vpxor %ymm3, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -41779,9 +40777,11 @@ ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k1 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -41816,69 +40816,68 @@ ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1523: +; NoVLX-NEXT: .Lcfi1273: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1524: +; NoVLX-NEXT: .Lcfi1274: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1525: +; NoVLX-NEXT: .Lcfi1275: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: movq %rax, %rdx -; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: vmovd %eax, %xmm2 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 -; NoVLX-NEXT: vextracti32x4 
$2, %zmm0, %xmm1 -; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq 
%xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 @@ -41886,7 +40885,8 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm0, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: movl %ecx, %eax @@ -41896,20 +40896,20 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0 ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; 
NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm4 +; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm2 ; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 ; NoVLX-NEXT: vpxor 32(%rdi), %ymm1, %ymm3 @@ -42089,12 +41089,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1526: +; NoVLX-NEXT: .Lcfi1276: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1527: +; NoVLX-NEXT: .Lcfi1277: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1528: +; NoVLX-NEXT: .Lcfi1278: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -42105,180 +41105,179 @@ ; NoVLX-NEXT: movq %rax, %rdx ; NoVLX-NEXT: vmovd %eax, %xmm3 ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 -; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 -; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 -; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax ; NoVLX-NEXT: shrq $32, %rdx -; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movq 
%rax, %rcx +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vmovq %xmm3, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm4 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 -; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rdx, %rcx +; NoVLX-NEXT: shrq $48, %rdx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm6, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, 
%xmm4 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: movl %eax, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $6, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vmovq %xmm0, %rdx ; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 -; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: movl %edx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: vmovd %edx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rdx, %rax ; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm0, %rax -; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $3, %edx, %xmm2, %xmm0 +; NoVLX-NEXT: movl %eax, %edx +; NoVLX-NEXT: shrl $16, %edx ; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: movq %rax, %rcx -; 
NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 -; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $6, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: movl %ecx, %eax ; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm6 +; NoVLX-NEXT: vmovq %xmm6, %rax +; NoVLX-NEXT: vpextrq $1, %xmm7, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %edx, %ecx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm5, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm8 +; NoVLX-NEXT: vmovd %eax, %xmm7 +; NoVLX-NEXT: vpinsrw $1, %ecx, %xmm7, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; 
NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm7, %xmm7 +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx ; NoVLX-NEXT: vmovd %ecx, %xmm3 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; NoVLX-NEXT: vmovq %xmm8, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm5 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %edx +; NoVLX-NEXT: shrl $16, %edx +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rdx +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rdx ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 -; 
NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm1 +; NoVLX-NEXT: movl %edx, %ecx ; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 -; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: vpinsrw $4, %edx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rdx, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 -; NoVLX-NEXT: vmovq %xmm1, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 -; NoVLX-NEXT: movl %ecx, %eax -; NoVLX-NEXT: shrl $16, %eax -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rcx, %rax -; NoVLX-NEXT: shrq $32, %rax -; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpextrq $1, %xmm1, %rax -; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm8 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: shrq $48, %rdx +; NoVLX-NEXT: vpinsrw $7, %edx, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rcx +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm3 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 -; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm4 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm4, %ymm5, %ymm2 +; NoVLX-NEXT: vpxor %ymm4, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm2 ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $3, %eax, %xmm7, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $4, %ecx, %xmm5, 
%xmm5 +; NoVLX-NEXT: vpinsrw $5, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $6, %eax, %xmm5, %xmm5 ; NoVLX-NEXT: shrq $48, %rcx -; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movl %eax, %ecx -; NoVLX-NEXT: shrl $16, %ecx -; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: movq %rax, %rcx -; NoVLX-NEXT: shrq $48, %rax -; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] -; NoVLX-NEXT: vpxor %ymm5, %ymm6, %ymm3 -; NoVLX-NEXT: vpxor %ymm5, %ymm2, %ymm2 -; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm5, %xmm5 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -42342,84 +41341,85 @@ ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3 -; NoVLX-NEXT: vpxor %ymm5, %ymm8, %ymm2 -; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4 -; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm4, %ymm2 -; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 -; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; NoVLX-NEXT: vpxor %ymm4, %ymm5, %ymm4 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm2 -; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, 
%xmm2 +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax -; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vpand %xmm1, %xmm2, %xmm1 +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 ; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 ; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -42457,12 +41457,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1529: +; NoVLX-NEXT: .Lcfi1279: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1530: +; NoVLX-NEXT: .Lcfi1280: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1531: +; NoVLX-NEXT: .Lcfi1281: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -42474,8 +41474,6 @@ ; NoVLX-NEXT: vmovd %eax, %xmm2 ; 
NoVLX-NEXT: shrl $16, %eax ; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 -; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 ; NoVLX-NEXT: shrq $32, %rdx ; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 ; NoVLX-NEXT: vpextrq $1, %xmm1, %rax @@ -42488,7 +41486,8 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 -; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2 ; NoVLX-NEXT: movl %ecx, %eax @@ -42498,7 +41497,7 @@ ; NoVLX-NEXT: movq %rcx, %rax ; NoVLX-NEXT: shrq $32, %rax ; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax ; NoVLX-NEXT: shrq $48, %rcx ; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 ; NoVLX-NEXT: movl %eax, %ecx @@ -42508,6 +41507,7 @@ ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx ; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 ; NoVLX-NEXT: vmovq %xmm3, %rcx ; NoVLX-NEXT: shrq $48, %rax ; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4 @@ -42547,7 +41547,7 @@ ; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: movq %rax, %rcx ; NoVLX-NEXT: shrq $32, %rcx -; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm6 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm3 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -42555,9 +41555,9 @@ ; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 ; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 ; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm3 -; NoVLX-NEXT: vpinsrw $7, %eax, %xmm6, %xmm2 -; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm2 ; NoVLX-NEXT: vmovdqa {{.*#+}} 
ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2 ; NoVLX-NEXT: vpxor (%rsi), %ymm4, %ymm5 @@ -42746,8 +41746,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -42804,8 +41804,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -42864,7 +41864,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -42877,13 +41876,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -42944,7 +41944,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; 
NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -42957,13 +41956,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -43026,8 +42026,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -43088,7 +42088,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -43101,13 +42100,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, 
%zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -43170,8 +42170,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -43227,8 +42227,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -43286,7 +42286,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -43299,13 +42298,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -43365,7 +42365,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, 
%xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -43378,13 +42377,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -43446,8 +42446,8 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -43507,7 +42507,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -43520,13 +42519,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, 
%zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -43582,12 +42582,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1532: +; NoVLX-NEXT: .Lcfi1282: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1533: +; NoVLX-NEXT: .Lcfi1283: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1534: +; NoVLX-NEXT: .Lcfi1284: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43628,12 +42628,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1535: +; NoVLX-NEXT: .Lcfi1285: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1536: +; NoVLX-NEXT: .Lcfi1286: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1537: +; NoVLX-NEXT: .Lcfi1287: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43676,12 +42676,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1538: +; NoVLX-NEXT: .Lcfi1288: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1539: +; NoVLX-NEXT: .Lcfi1289: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1540: +; NoVLX-NEXT: .Lcfi1290: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43691,7 +42691,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -43704,7 +42703,8 @@ ; NoVLX-NEXT: 
vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -43744,12 +42744,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1541: +; NoVLX-NEXT: .Lcfi1291: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1542: +; NoVLX-NEXT: .Lcfi1292: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1543: +; NoVLX-NEXT: .Lcfi1293: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43759,7 +42759,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -43772,7 +42771,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -43813,12 +42813,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1544: +; NoVLX-NEXT: .Lcfi1294: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1545: +; NoVLX-NEXT: .Lcfi1295: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1546: +; NoVLX-NEXT: .Lcfi1296: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43863,12 +42863,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1547: +; NoVLX-NEXT: .Lcfi1297: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1548: +; NoVLX-NEXT: .Lcfi1298: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1549: +; NoVLX-NEXT: .Lcfi1299: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -43879,7 +42879,6 @@ ; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -43892,7 +42891,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -43934,12 +42934,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1550: +; NoVLX-NEXT: .Lcfi1300: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1551: +; NoVLX-NEXT: .Lcfi1301: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1552: +; NoVLX-NEXT: .Lcfi1302: ; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -43986,19 +42986,19 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1553: +; NoVLX-NEXT: .Lcfi1303: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1554: +; NoVLX-NEXT: .Lcfi1304: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1555: +; NoVLX-NEXT: .Lcfi1305: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -44040,19 +43040,18 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1556: +; NoVLX-NEXT: .Lcfi1306: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1557: +; NoVLX-NEXT: .Lcfi1307: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1558: +; NoVLX-NEXT: .Lcfi1308: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -44064,13 +43063,14 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, 
%xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -44114,19 +43114,18 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1559: +; NoVLX-NEXT: .Lcfi1309: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1560: +; NoVLX-NEXT: .Lcfi1310: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1561: +; NoVLX-NEXT: .Lcfi1311: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -44138,13 +43137,14 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -44189,12 +43189,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1562: +; NoVLX-NEXT: .Lcfi1312: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1563: +; NoVLX-NEXT: .Lcfi1313: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1564: +; NoVLX-NEXT: .Lcfi1314: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -44245,12 +43245,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1565: +; NoVLX-NEXT: .Lcfi1315: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1566: +; NoVLX-NEXT: .Lcfi1316: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1567: +; NoVLX-NEXT: .Lcfi1317: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -44258,7 +43258,6 @@ ; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -44270,13 +43269,14 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k3, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k2, %eax -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; NoVLX-NEXT: kmovw %k1, %eax -; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, 
%xmm0 +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -44511,12 +43511,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1568: +; NoVLX-NEXT: .Lcfi1318: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1569: +; NoVLX-NEXT: .Lcfi1319: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1570: +; NoVLX-NEXT: .Lcfi1320: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -44527,35 +43527,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, 
%xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -44586,12 +43586,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1571: +; NoVLX-NEXT: .Lcfi1321: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1572: +; NoVLX-NEXT: .Lcfi1322: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1573: +; NoVLX-NEXT: .Lcfi1323: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -44602,35 +43602,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -44663,12 +43663,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1574: +; NoVLX-NEXT: .Lcfi1324: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1575: +; NoVLX-NEXT: .Lcfi1325: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1576: +; NoVLX-NEXT: .Lcfi1326: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -44681,35 +43681,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, 
%eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -44743,12 +43743,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1577: +; NoVLX-NEXT: .Lcfi1327: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1578: +; NoVLX-NEXT: .Lcfi1328: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1579: +; NoVLX-NEXT: .Lcfi1329: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -44761,35 +43761,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -44824,12 +43824,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1580: +; NoVLX-NEXT: .Lcfi1330: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1581: +; NoVLX-NEXT: .Lcfi1331: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1582: +; NoVLX-NEXT: .Lcfi1332: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -44840,35 +43840,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; 
NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -44902,12 +43902,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1583: +; NoVLX-NEXT: .Lcfi1333: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1584: +; NoVLX-NEXT: .Lcfi1334: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1585: +; NoVLX-NEXT: .Lcfi1335: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -44920,35 +43920,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, 
%xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -44984,12 +43984,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1586: +; NoVLX-NEXT: .Lcfi1336: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1587: +; NoVLX-NEXT: .Lcfi1337: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1588: +; NoVLX-NEXT: .Lcfi1338: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -45064,12 +44064,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1589: +; NoVLX-NEXT: .Lcfi1339: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1590: +; NoVLX-NEXT: .Lcfi1340: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1591: +; NoVLX-NEXT: .Lcfi1341: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -45146,12 +44146,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1592: +; NoVLX-NEXT: .Lcfi1342: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1593: +; NoVLX-NEXT: .Lcfi1343: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1594: +; NoVLX-NEXT: .Lcfi1344: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; 
NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -45231,12 +44231,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1595: +; NoVLX-NEXT: .Lcfi1345: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1596: +; NoVLX-NEXT: .Lcfi1346: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1597: +; NoVLX-NEXT: .Lcfi1347: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -45317,12 +44317,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1598: +; NoVLX-NEXT: .Lcfi1348: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1599: +; NoVLX-NEXT: .Lcfi1349: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1600: +; NoVLX-NEXT: .Lcfi1350: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -45400,12 +44400,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1601: +; NoVLX-NEXT: .Lcfi1351: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1602: +; NoVLX-NEXT: .Lcfi1352: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1603: +; NoVLX-NEXT: .Lcfi1353: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -45487,12 +44487,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1604: +; NoVLX-NEXT: .Lcfi1354: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1605: +; NoVLX-NEXT: .Lcfi1355: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1606: +; NoVLX-NEXT: .Lcfi1356: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: 
pushq %r14 @@ -45501,15 +44501,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1607: +; NoVLX-NEXT: .Lcfi1357: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1608: +; NoVLX-NEXT: .Lcfi1358: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1609: +; NoVLX-NEXT: .Lcfi1359: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1610: +; NoVLX-NEXT: .Lcfi1360: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1611: +; NoVLX-NEXT: .Lcfi1361: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -45572,9 +44572,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -45610,12 +44610,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1612: +; NoVLX-NEXT: .Lcfi1362: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1613: +; NoVLX-NEXT: .Lcfi1363: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1614: +; NoVLX-NEXT: .Lcfi1364: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45624,15 +44624,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1615: +; NoVLX-NEXT: .Lcfi1365: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1616: +; NoVLX-NEXT: .Lcfi1366: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1617: +; NoVLX-NEXT: .Lcfi1367: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1618: +; NoVLX-NEXT: .Lcfi1368: ; NoVLX-NEXT: 
.cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1619: +; NoVLX-NEXT: .Lcfi1369: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -45695,9 +44695,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -45735,12 +44735,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1620: +; NoVLX-NEXT: .Lcfi1370: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1621: +; NoVLX-NEXT: .Lcfi1371: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1622: +; NoVLX-NEXT: .Lcfi1372: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45749,15 +44749,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1623: +; NoVLX-NEXT: .Lcfi1373: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1624: +; NoVLX-NEXT: .Lcfi1374: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1625: +; NoVLX-NEXT: .Lcfi1375: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1626: +; NoVLX-NEXT: .Lcfi1376: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1627: +; NoVLX-NEXT: .Lcfi1377: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} @@ -45821,9 +44821,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; 
NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -45862,12 +44862,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1628: +; NoVLX-NEXT: .Lcfi1378: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1629: +; NoVLX-NEXT: .Lcfi1379: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1630: +; NoVLX-NEXT: .Lcfi1380: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -45876,15 +44876,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1631: +; NoVLX-NEXT: .Lcfi1381: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1632: +; NoVLX-NEXT: .Lcfi1382: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1633: +; NoVLX-NEXT: .Lcfi1383: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1634: +; NoVLX-NEXT: .Lcfi1384: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1635: +; NoVLX-NEXT: .Lcfi1385: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} @@ -45948,9 +44948,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -45990,12 +44990,12 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1636: +; NoVLX-NEXT: .Lcfi1386: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1637: 
+; NoVLX-NEXT: .Lcfi1387: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1638: +; NoVLX-NEXT: .Lcfi1388: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -46004,15 +45004,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1639: +; NoVLX-NEXT: .Lcfi1389: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1640: +; NoVLX-NEXT: .Lcfi1390: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1641: +; NoVLX-NEXT: .Lcfi1391: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1642: +; NoVLX-NEXT: .Lcfi1392: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1643: +; NoVLX-NEXT: .Lcfi1393: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -46075,9 +45075,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -46116,12 +45116,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1644: +; NoVLX-NEXT: .Lcfi1394: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1645: +; NoVLX-NEXT: .Lcfi1395: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1646: +; NoVLX-NEXT: .Lcfi1396: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -46130,15 +45130,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1647: +; NoVLX-NEXT: .Lcfi1397: ; NoVLX-NEXT: .cfi_offset %rbx, -56 
-; NoVLX-NEXT: .Lcfi1648: +; NoVLX-NEXT: .Lcfi1398: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1649: +; NoVLX-NEXT: .Lcfi1399: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1650: +; NoVLX-NEXT: .Lcfi1400: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1651: +; NoVLX-NEXT: .Lcfi1401: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} @@ -46202,9 +45202,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -46245,30 +45245,15 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1652: +; NoVLX-NEXT: .Lcfi1402: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1653: +; NoVLX-NEXT: .Lcfi1403: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1654: +; NoVLX-NEXT: .Lcfi1404: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1655: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1656: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1657: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1658: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1659: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -46276,64 +45261,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; 
NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -46345,12 +45330,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -46373,30 +45353,15 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1660: +; NoVLX-NEXT: .Lcfi1405: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; 
NoVLX-NEXT: .Lcfi1661: +; NoVLX-NEXT: .Lcfi1406: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1662: +; NoVLX-NEXT: .Lcfi1407: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1663: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1664: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1665: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1666: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1667: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -46404,64 +45369,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, 
%xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -46473,12 +45438,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -46503,30 +45463,15 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1668: +; NoVLX-NEXT: .Lcfi1408: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1669: +; NoVLX-NEXT: .Lcfi1409: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1670: +; NoVLX-NEXT: .Lcfi1410: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1671: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1672: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1673: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1674: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1675: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -46535,64 +45480,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; 
NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: 
kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -46604,12 +45549,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -46635,30 +45575,15 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1676: +; NoVLX-NEXT: .Lcfi1411: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1677: +; NoVLX-NEXT: .Lcfi1412: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1678: +; NoVLX-NEXT: .Lcfi1413: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; 
NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1679: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1680: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1681: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1682: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1683: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -46667,64 +45592,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, 
%r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, 
%ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -46736,12 +45661,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -46768,30 +45688,15 @@ ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1684: +; NoVLX-NEXT: .Lcfi1414: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1685: +; NoVLX-NEXT: .Lcfi1415: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1686: +; NoVLX-NEXT: .Lcfi1416: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1687: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1688: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1689: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1690: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1691: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -46799,64 +45704,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; 
NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, 
%r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -46868,12 +45773,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -46899,30 +45799,15 @@ ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1692: +; NoVLX-NEXT: .Lcfi1417: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1693: +; NoVLX-NEXT: .Lcfi1418: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1694: +; NoVLX-NEXT: .Lcfi1419: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: 
.Lcfi1695: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1696: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1697: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1698: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1699: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -46931,64 +45816,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; 
NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -47000,12 +45885,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; 
NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -47063,9 +45943,9 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -47097,7 +45977,6 @@ ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -47105,9 +45984,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -47141,7 +46021,6 @@ ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 @@ -47149,9 +46028,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -47221,7 +46101,6 @@ ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -47229,9 +46108,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -47270,8 +46150,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -47279,9 +46159,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, 
%zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47312,8 +46192,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -47321,9 +46201,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47366,8 +46246,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -47375,9 +46255,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, 
%zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47422,8 +46302,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -47431,9 +46311,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47470,8 +46350,8 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -47479,9 +46359,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47526,8 
+46406,8 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -47535,9 +46415,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47574,17 +46454,17 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47615,17 +46495,17 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; 
NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47668,17 +46548,17 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47723,17 +46603,17 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: 
kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47770,17 +46650,17 @@ ; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47825,17 +46705,17 @@ ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 
{%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -47866,12 +46746,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1700: +; NoVLX-NEXT: .Lcfi1420: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1701: +; NoVLX-NEXT: .Lcfi1421: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1702: +; NoVLX-NEXT: .Lcfi1422: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -47912,12 +46792,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1703: +; NoVLX-NEXT: .Lcfi1423: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1704: +; NoVLX-NEXT: .Lcfi1424: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1705: +; NoVLX-NEXT: .Lcfi1425: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -47960,12 +46840,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1706: +; NoVLX-NEXT: .Lcfi1426: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1707: +; NoVLX-NEXT: .Lcfi1427: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1708: +; NoVLX-NEXT: .Lcfi1428: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48020,12 +46900,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1709: +; NoVLX-NEXT: .Lcfi1429: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1710: 
+; NoVLX-NEXT: .Lcfi1430: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1711: +; NoVLX-NEXT: .Lcfi1431: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48081,12 +46961,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1712: +; NoVLX-NEXT: .Lcfi1432: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1713: +; NoVLX-NEXT: .Lcfi1433: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1714: +; NoVLX-NEXT: .Lcfi1434: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48131,12 +47011,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1715: +; NoVLX-NEXT: .Lcfi1435: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1716: +; NoVLX-NEXT: .Lcfi1436: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1717: +; NoVLX-NEXT: .Lcfi1437: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -48194,12 +47074,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1718: +; NoVLX-NEXT: .Lcfi1438: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1719: +; NoVLX-NEXT: .Lcfi1439: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1720: +; NoVLX-NEXT: .Lcfi1440: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48246,19 +47126,19 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1721: +; NoVLX-NEXT: .Lcfi1441: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1722: +; NoVLX-NEXT: .Lcfi1442: ; NoVLX-NEXT: 
.cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1723: +; NoVLX-NEXT: .Lcfi1443: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -48300,19 +47180,18 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1724: +; NoVLX-NEXT: .Lcfi1444: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1725: +; NoVLX-NEXT: .Lcfi1445: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1726: +; NoVLX-NEXT: .Lcfi1446: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -48320,9 +47199,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -48366,19 +47246,18 @@ ; NoVLX-LABEL: 
test_masked_vpcmpultq_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1727: +; NoVLX-NEXT: .Lcfi1447: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1728: +; NoVLX-NEXT: .Lcfi1448: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1729: +; NoVLX-NEXT: .Lcfi1449: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -48386,9 +47265,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -48433,12 +47313,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1730: +; NoVLX-NEXT: .Lcfi1450: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1731: +; NoVLX-NEXT: .Lcfi1451: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1732: +; NoVLX-NEXT: .Lcfi1452: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48489,12 +47369,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1733: +; NoVLX-NEXT: .Lcfi1453: ; NoVLX-NEXT: .cfi_def_cfa_offset 
16 -; NoVLX-NEXT: .Lcfi1734: +; NoVLX-NEXT: .Lcfi1454: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1735: +; NoVLX-NEXT: .Lcfi1455: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -48502,7 +47382,6 @@ ; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -48510,9 +47389,10 @@ ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vmovd %ecx, %xmm1 -; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpand %xmm0, %xmm2, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -48566,8 +47446,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -48626,8 +47506,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -48688,7 +47568,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 
; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -48701,13 +47580,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -48770,7 +47650,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -48783,13 +47662,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -48854,8 +47734,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb 
$0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -48918,7 +47798,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -48931,13 +47810,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -49002,8 +47882,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -49061,8 +47941,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -49122,7 +48002,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, 
%k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -49135,13 +48014,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -49203,7 +48083,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -49216,13 +48095,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -49286,8 +48166,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, 
%zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -49349,7 +48229,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -49362,13 +48241,14 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -49425,12 +48305,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1736: +; NoVLX-NEXT: .Lcfi1456: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1737: +; NoVLX-NEXT: .Lcfi1457: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1738: +; NoVLX-NEXT: .Lcfi1458: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -49473,12 +48353,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1739: +; NoVLX-NEXT: .Lcfi1459: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1740: +; NoVLX-NEXT: .Lcfi1460: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1741: +; NoVLX-NEXT: .Lcfi1461: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; 
NoVLX-NEXT: subq $32, %rsp @@ -49523,12 +48403,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1742: +; NoVLX-NEXT: .Lcfi1462: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1743: +; NoVLX-NEXT: .Lcfi1463: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1744: +; NoVLX-NEXT: .Lcfi1464: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -49539,7 +48419,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -49552,7 +48431,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -49593,12 +48473,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1745: +; NoVLX-NEXT: .Lcfi1465: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1746: +; NoVLX-NEXT: .Lcfi1466: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1747: +; NoVLX-NEXT: .Lcfi1467: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -49609,7 +48489,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw 
$13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -49622,7 +48501,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -49664,12 +48544,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1748: +; NoVLX-NEXT: .Lcfi1468: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1749: +; NoVLX-NEXT: .Lcfi1469: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1750: +; NoVLX-NEXT: .Lcfi1470: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -49716,12 +48596,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1751: +; NoVLX-NEXT: .Lcfi1471: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1752: +; NoVLX-NEXT: .Lcfi1472: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1753: +; NoVLX-NEXT: .Lcfi1473: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -49733,7 +48613,6 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kshiftlw $13, %k0, %k2 ; NoVLX-NEXT: kshiftrw $15, %k2, %k2 ; NoVLX-NEXT: kshiftlw $15, %k0, %k3 @@ -49746,7 +48625,8 @@ ; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k2, %eax ; NoVLX-NEXT: vpinsrb $8, 
%eax, %xmm1, %xmm1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftrw $15, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -49789,12 +48669,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1754: +; NoVLX-NEXT: .Lcfi1474: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1755: +; NoVLX-NEXT: .Lcfi1475: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1756: +; NoVLX-NEXT: .Lcfi1476: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -49843,12 +48723,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1757: +; NoVLX-NEXT: .Lcfi1477: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1758: +; NoVLX-NEXT: .Lcfi1478: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1759: +; NoVLX-NEXT: .Lcfi1479: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -49899,12 +48779,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1760: +; NoVLX-NEXT: .Lcfi1480: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1761: +; NoVLX-NEXT: .Lcfi1481: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1762: +; NoVLX-NEXT: .Lcfi1482: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -49912,7 +48792,6 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, 
%ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -49930,6 +48809,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -49975,20 +48855,19 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1763: +; NoVLX-NEXT: .Lcfi1483: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1764: +; NoVLX-NEXT: .Lcfi1484: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1765: +; NoVLX-NEXT: .Lcfi1485: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp ; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm2 ; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 -; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -50006,6 +48885,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -50052,12 +48932,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1766: +; NoVLX-NEXT: .Lcfi1486: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1767: +; NoVLX-NEXT: .Lcfi1487: ; NoVLX-NEXT: 
.cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1768: +; NoVLX-NEXT: .Lcfi1488: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -50110,12 +48990,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1769: +; NoVLX-NEXT: .Lcfi1489: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1770: +; NoVLX-NEXT: .Lcfi1490: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1771: +; NoVLX-NEXT: .Lcfi1491: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -50124,7 +49004,6 @@ ; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 -; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: kmovw %edi, %k0 ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 @@ -50142,6 +49021,7 @@ ; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -50353,12 +49233,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1772: +; NoVLX-NEXT: .Lcfi1492: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1773: +; NoVLX-NEXT: .Lcfi1493: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1774: +; NoVLX-NEXT: .Lcfi1494: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -50367,35 +49247,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, 
%k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -50426,12 +49306,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1775: +; NoVLX-NEXT: .Lcfi1495: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1776: +; NoVLX-NEXT: .Lcfi1496: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; 
NoVLX-NEXT: .Lcfi1777: +; NoVLX-NEXT: .Lcfi1497: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -50440,35 +49320,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, 
%xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -50501,12 +49381,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1778: +; NoVLX-NEXT: .Lcfi1498: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1779: +; NoVLX-NEXT: .Lcfi1499: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1780: +; NoVLX-NEXT: .Lcfi1500: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -50516,35 +49396,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; 
NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -50578,12 +49458,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1781: +; NoVLX-NEXT: .Lcfi1501: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1782: +; NoVLX-NEXT: .Lcfi1502: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1783: +; NoVLX-NEXT: .Lcfi1503: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -50593,35 +49473,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; 
NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -50656,12 +49536,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1784: +; NoVLX-NEXT: .Lcfi1504: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1785: +; NoVLX-NEXT: .Lcfi1505: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1786: +; NoVLX-NEXT: .Lcfi1506: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -50670,35 +49550,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, 
%xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -50732,12 +49612,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1787: +; NoVLX-NEXT: .Lcfi1507: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1788: +; NoVLX-NEXT: .Lcfi1508: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1789: +; NoVLX-NEXT: .Lcfi1509: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -50747,35 +49627,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: 
kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -50811,12 +49691,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1790: +; NoVLX-NEXT: .Lcfi1510: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1791: +; NoVLX-NEXT: .Lcfi1511: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1792: +; NoVLX-NEXT: .Lcfi1512: ; 
NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -50889,12 +49769,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1793: +; NoVLX-NEXT: .Lcfi1513: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1794: +; NoVLX-NEXT: .Lcfi1514: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1795: +; NoVLX-NEXT: .Lcfi1515: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -50969,12 +49849,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1796: +; NoVLX-NEXT: .Lcfi1516: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1797: +; NoVLX-NEXT: .Lcfi1517: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1798: +; NoVLX-NEXT: .Lcfi1518: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51051,12 +49931,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1799: +; NoVLX-NEXT: .Lcfi1519: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1800: +; NoVLX-NEXT: .Lcfi1520: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1801: +; NoVLX-NEXT: .Lcfi1521: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51134,12 +50014,12 @@ ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1802: +; NoVLX-NEXT: .Lcfi1522: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1803: +; NoVLX-NEXT: .Lcfi1523: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1804: +; NoVLX-NEXT: .Lcfi1524: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp 
; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51215,12 +50095,12 @@ ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1805: +; NoVLX-NEXT: .Lcfi1525: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1806: +; NoVLX-NEXT: .Lcfi1526: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1807: +; NoVLX-NEXT: .Lcfi1527: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -51302,8 +50182,8 @@ ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -51357,8 +50237,8 @@ ; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -51414,8 +50294,8 @@ ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -51479,8 +50359,8 @@ ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; 
NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -51544,8 +50424,8 @@ ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -51611,8 +50491,8 @@ ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -51672,8 +50552,8 @@ ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -51726,8 +50606,8 @@ ; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -51782,8 +50662,8 @@ ; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd 
$255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -51846,8 +50726,8 @@ ; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -51910,8 +50790,8 @@ ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -51976,8 +50856,8 @@ ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -52033,12 +50913,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1808: +; NoVLX-NEXT: .Lcfi1528: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1809: +; NoVLX-NEXT: .Lcfi1529: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1810: +; NoVLX-NEXT: .Lcfi1530: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -52076,12 +50956,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1811: +; NoVLX-NEXT: .Lcfi1531: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; 
NoVLX-NEXT: .Lcfi1812: +; NoVLX-NEXT: .Lcfi1532: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1813: +; NoVLX-NEXT: .Lcfi1533: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -52120,12 +51000,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1814: +; NoVLX-NEXT: .Lcfi1534: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1815: +; NoVLX-NEXT: .Lcfi1535: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1816: +; NoVLX-NEXT: .Lcfi1536: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -52168,12 +51048,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1817: +; NoVLX-NEXT: .Lcfi1537: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1818: +; NoVLX-NEXT: .Lcfi1538: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1819: +; NoVLX-NEXT: .Lcfi1539: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52221,12 +51101,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1820: +; NoVLX-NEXT: .Lcfi1540: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1821: +; NoVLX-NEXT: .Lcfi1541: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1822: +; NoVLX-NEXT: .Lcfi1542: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52275,12 +51155,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1823: +; NoVLX-NEXT: .Lcfi1543: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1824: +; 
NoVLX-NEXT: .Lcfi1544: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1825: +; NoVLX-NEXT: .Lcfi1545: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52331,12 +51211,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1826: +; NoVLX-NEXT: .Lcfi1546: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1827: +; NoVLX-NEXT: .Lcfi1547: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1828: +; NoVLX-NEXT: .Lcfi1548: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52380,12 +51260,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1829: +; NoVLX-NEXT: .Lcfi1549: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1830: +; NoVLX-NEXT: .Lcfi1550: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1831: +; NoVLX-NEXT: .Lcfi1551: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52430,12 +51310,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1832: +; NoVLX-NEXT: .Lcfi1552: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1833: +; NoVLX-NEXT: .Lcfi1553: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1834: +; NoVLX-NEXT: .Lcfi1554: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -52484,12 +51364,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1835: +; NoVLX-NEXT: .Lcfi1555: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1836: +; NoVLX-NEXT: .Lcfi1556: ; NoVLX-NEXT: .cfi_offset 
%rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1837: +; NoVLX-NEXT: .Lcfi1557: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -52543,12 +51423,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1838: +; NoVLX-NEXT: .Lcfi1558: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1839: +; NoVLX-NEXT: .Lcfi1559: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1840: +; NoVLX-NEXT: .Lcfi1560: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -52556,8 +51436,8 @@ ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 ; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 ; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovqd %zmm1, %ymm1 ; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -52603,12 +51483,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1841: +; NoVLX-NEXT: .Lcfi1561: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1842: +; NoVLX-NEXT: .Lcfi1562: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1843: +; NoVLX-NEXT: .Lcfi1563: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -52854,12 +51734,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1844: +; NoVLX-NEXT: .Lcfi1564: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1845: +; NoVLX-NEXT: .Lcfi1565: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1846: +; NoVLX-NEXT: .Lcfi1566: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; 
NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -52870,35 +51750,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -52929,12 +51809,12 @@ ; NoVLX-LABEL: 
test_vcmpoeqps_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1847: +; NoVLX-NEXT: .Lcfi1567: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1848: +; NoVLX-NEXT: .Lcfi1568: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1849: +; NoVLX-NEXT: .Lcfi1569: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -52945,35 +51825,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -53005,12 +51885,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1850: +; NoVLX-NEXT: .Lcfi1570: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1851: +; NoVLX-NEXT: .Lcfi1571: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1852: +; NoVLX-NEXT: .Lcfi1572: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -53021,35 +51901,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -53083,12 +51963,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1853: +; NoVLX-NEXT: .Lcfi1573: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1854: +; NoVLX-NEXT: .Lcfi1574: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1855: +; NoVLX-NEXT: .Lcfi1575: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -53101,35 +51981,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: 
kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -53163,12 +52043,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1856: +; NoVLX-NEXT: .Lcfi1576: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1857: +; NoVLX-NEXT: .Lcfi1577: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1858: +; NoVLX-NEXT: .Lcfi1578: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -53181,35 +52061,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -53244,12 +52124,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1859: +; NoVLX-NEXT: .Lcfi1579: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1860: +; NoVLX-NEXT: .Lcfi1580: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1861: +; NoVLX-NEXT: .Lcfi1581: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; 
NoVLX-NEXT: subq $32, %rsp @@ -53262,35 +52142,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -53327,12 +52207,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: 
; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1862: +; NoVLX-NEXT: .Lcfi1582: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1863: +; NoVLX-NEXT: .Lcfi1583: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1864: +; NoVLX-NEXT: .Lcfi1584: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -53407,12 +52287,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1865: +; NoVLX-NEXT: .Lcfi1585: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1866: +; NoVLX-NEXT: .Lcfi1586: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1867: +; NoVLX-NEXT: .Lcfi1587: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -53488,12 +52368,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1868: +; NoVLX-NEXT: .Lcfi1588: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1869: +; NoVLX-NEXT: .Lcfi1589: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1870: +; NoVLX-NEXT: .Lcfi1590: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -53571,12 +52451,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1871: +; NoVLX-NEXT: .Lcfi1591: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1872: +; NoVLX-NEXT: .Lcfi1592: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1873: +; NoVLX-NEXT: .Lcfi1593: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -53656,12 +52536,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: 
pushq %rbp -; NoVLX-NEXT: .Lcfi1874: +; NoVLX-NEXT: .Lcfi1594: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1875: +; NoVLX-NEXT: .Lcfi1595: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1876: +; NoVLX-NEXT: .Lcfi1596: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -53742,12 +52622,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1877: +; NoVLX-NEXT: .Lcfi1597: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1878: +; NoVLX-NEXT: .Lcfi1598: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1879: +; NoVLX-NEXT: .Lcfi1599: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -53830,12 +52710,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1880: +; NoVLX-NEXT: .Lcfi1600: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1881: +; NoVLX-NEXT: .Lcfi1601: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1882: +; NoVLX-NEXT: .Lcfi1602: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53844,15 +52724,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1883: +; NoVLX-NEXT: .Lcfi1603: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1884: +; NoVLX-NEXT: .Lcfi1604: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1885: +; NoVLX-NEXT: .Lcfi1605: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1886: +; NoVLX-NEXT: .Lcfi1606: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1887: +; NoVLX-NEXT: .Lcfi1607: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -53915,9 +52795,9 @@ ; 
NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -53953,12 +52833,12 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1888: +; NoVLX-NEXT: .Lcfi1608: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1889: +; NoVLX-NEXT: .Lcfi1609: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1890: +; NoVLX-NEXT: .Lcfi1610: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -53967,15 +52847,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1891: +; NoVLX-NEXT: .Lcfi1611: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1892: +; NoVLX-NEXT: .Lcfi1612: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1893: +; NoVLX-NEXT: .Lcfi1613: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1894: +; NoVLX-NEXT: .Lcfi1614: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1895: +; NoVLX-NEXT: .Lcfi1615: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -54038,9 +52918,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -54077,12 +52957,12 @@ ; 
NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1896: +; NoVLX-NEXT: .Lcfi1616: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1897: +; NoVLX-NEXT: .Lcfi1617: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1898: +; NoVLX-NEXT: .Lcfi1618: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -54091,15 +52971,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1899: +; NoVLX-NEXT: .Lcfi1619: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1900: +; NoVLX-NEXT: .Lcfi1620: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1901: +; NoVLX-NEXT: .Lcfi1621: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1902: +; NoVLX-NEXT: .Lcfi1622: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1903: +; NoVLX-NEXT: .Lcfi1623: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -54162,9 +53042,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -54203,12 +53083,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1904: +; NoVLX-NEXT: .Lcfi1624: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1905: +; NoVLX-NEXT: .Lcfi1625: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1906: +; NoVLX-NEXT: .Lcfi1626: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; 
NoVLX-NEXT: pushq %r14 @@ -54217,15 +53097,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1907: +; NoVLX-NEXT: .Lcfi1627: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1908: +; NoVLX-NEXT: .Lcfi1628: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1909: +; NoVLX-NEXT: .Lcfi1629: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1910: +; NoVLX-NEXT: .Lcfi1630: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1911: +; NoVLX-NEXT: .Lcfi1631: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} @@ -54289,9 +53169,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -54330,12 +53210,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1912: +; NoVLX-NEXT: .Lcfi1632: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1913: +; NoVLX-NEXT: .Lcfi1633: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1914: +; NoVLX-NEXT: .Lcfi1634: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -54344,15 +53224,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1915: +; NoVLX-NEXT: .Lcfi1635: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1916: +; NoVLX-NEXT: .Lcfi1636: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1917: +; NoVLX-NEXT: .Lcfi1637: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1918: +; NoVLX-NEXT: 
.Lcfi1638: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1919: +; NoVLX-NEXT: .Lcfi1639: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1} @@ -54416,9 +53296,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -54458,12 +53338,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1920: +; NoVLX-NEXT: .Lcfi1640: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1921: +; NoVLX-NEXT: .Lcfi1641: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1922: +; NoVLX-NEXT: .Lcfi1642: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: pushq %r15 ; NoVLX-NEXT: pushq %r14 @@ -54472,15 +53352,15 @@ ; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp -; NoVLX-NEXT: .Lcfi1923: +; NoVLX-NEXT: .Lcfi1643: ; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1924: +; NoVLX-NEXT: .Lcfi1644: ; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1925: +; NoVLX-NEXT: .Lcfi1645: ; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1926: +; NoVLX-NEXT: .Lcfi1646: ; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1927: +; NoVLX-NEXT: .Lcfi1647: ; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1} @@ -54544,9 +53424,9 @@ ; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: 
vpinsrb $13, %r10d, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -54634,30 +53514,15 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1928: +; NoVLX-NEXT: .Lcfi1648: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1929: +; NoVLX-NEXT: .Lcfi1649: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1930: +; NoVLX-NEXT: .Lcfi1650: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1931: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1932: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1933: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1934: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1935: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -54665,64 +53530,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, 
%k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; 
NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -54734,12 +53599,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -54762,30 +53622,15 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1936: +; NoVLX-NEXT: .Lcfi1651: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1937: +; NoVLX-NEXT: .Lcfi1652: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1938: +; NoVLX-NEXT: .Lcfi1653: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1939: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1940: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1941: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1942: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; 
NoVLX-NEXT: .Lcfi1943: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -54793,64 +53638,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -54862,12 +53707,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; 
NoVLX-NEXT: retq entry: @@ -54891,30 +53731,15 @@ ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1944: +; NoVLX-NEXT: .Lcfi1654: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1945: +; NoVLX-NEXT: .Lcfi1655: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1946: +; NoVLX-NEXT: .Lcfi1656: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1947: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1948: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1949: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1950: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1951: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) @@ -54922,64 +53747,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; 
NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: 
vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -54991,12 +53816,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -55022,30 +53842,15 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1952: +; NoVLX-NEXT: .Lcfi1657: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1953: +; NoVLX-NEXT: .Lcfi1658: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1954: +; NoVLX-NEXT: .Lcfi1659: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1955: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1956: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1957: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1958: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1959: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -55054,64 +53859,64 @@ ; 
NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb 
$9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -55123,12 +53928,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -55154,30 +53954,15 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1960: +; NoVLX-NEXT: .Lcfi1660: ; 
NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1961: +; NoVLX-NEXT: .Lcfi1661: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1962: +; NoVLX-NEXT: .Lcfi1662: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1963: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1964: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1965: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1966: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1967: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -55186,64 +53971,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, 
%xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; 
NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -55255,12 +54040,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -55287,30 +54067,15 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1968: +; NoVLX-NEXT: .Lcfi1663: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1969: +; NoVLX-NEXT: .Lcfi1664: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1970: +; NoVLX-NEXT: .Lcfi1665: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp -; NoVLX-NEXT: pushq %r15 -; NoVLX-NEXT: pushq %r14 -; NoVLX-NEXT: pushq %r13 -; NoVLX-NEXT: pushq %r12 -; NoVLX-NEXT: pushq %rbx ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp -; NoVLX-NEXT: .Lcfi1971: -; NoVLX-NEXT: .cfi_offset %rbx, -56 -; NoVLX-NEXT: .Lcfi1972: -; NoVLX-NEXT: .cfi_offset %r12, -48 -; NoVLX-NEXT: .Lcfi1973: -; NoVLX-NEXT: .cfi_offset %r13, -40 -; NoVLX-NEXT: .Lcfi1974: -; NoVLX-NEXT: .cfi_offset %r14, -32 -; NoVLX-NEXT: .Lcfi1975: -; NoVLX-NEXT: .cfi_offset %r15, -24 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -55319,64 +54084,64 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $15, %k0, 
%k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
-; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vmovd %r10d, %xmm0 -; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 @@ -55388,12 +54153,7 @@ ; NoVLX-NEXT: shlq $32, %rcx ; NoVLX-NEXT: movl (%rsp), %eax ; NoVLX-NEXT: orq %rcx, %rax -; NoVLX-NEXT: leaq -40(%rbp), %rsp -; NoVLX-NEXT: popq %rbx -; NoVLX-NEXT: popq %r12 -; NoVLX-NEXT: popq %r13 -; NoVLX-NEXT: popq %r14 -; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: movq %rbp, %rsp ; NoVLX-NEXT: popq %rbp ; NoVLX-NEXT: retq entry: @@ -55598,8 +54358,8 @@ ; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0,2],zero,zero ; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 @@ -55674,8 +54434,8 @@ ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -55683,9 +54443,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -55713,8 +54473,8 @@ ; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -55722,9 +54482,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -55754,8 +54514,8 @@ ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: 
kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -55763,9 +54523,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -55802,8 +54562,8 @@ ; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -55811,9 +54571,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -55850,8 +54610,8 @@ ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; 
NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -55859,9 +54619,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -55900,8 +54660,8 @@ ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -55909,9 +54669,9 @@ ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -55945,17 +54705,17 @@ ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw 
%k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -55983,17 +54743,17 @@ ; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -56023,17 +54783,17 @@ ; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = 
[0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -56070,17 +54830,17 @@ ; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -56117,17 +54877,17 @@ ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, 
%k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -56166,17 +54926,17 @@ ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $8, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 ; NoVLX-NEXT: kshiftrw $1, %k1, %k1 ; NoVLX-NEXT: kshiftlw $1, %k1, %k1 ; NoVLX-NEXT: korw %k0, %k1, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax @@ -56207,12 +54967,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1976: +; NoVLX-NEXT: .Lcfi1666: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1977: +; NoVLX-NEXT: .Lcfi1667: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1978: +; NoVLX-NEXT: .Lcfi1668: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -56250,12 +55010,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1979: +; NoVLX-NEXT: .Lcfi1669: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1980: +; NoVLX-NEXT: .Lcfi1670: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1981: +; NoVLX-NEXT: .Lcfi1671: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -56294,12 +55054,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # 
%entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1982: +; NoVLX-NEXT: .Lcfi1672: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1983: +; NoVLX-NEXT: .Lcfi1673: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1984: +; NoVLX-NEXT: .Lcfi1674: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -56342,12 +55102,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1985: +; NoVLX-NEXT: .Lcfi1675: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1986: +; NoVLX-NEXT: .Lcfi1676: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1987: +; NoVLX-NEXT: .Lcfi1677: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56394,12 +55154,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1988: +; NoVLX-NEXT: .Lcfi1678: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1989: +; NoVLX-NEXT: .Lcfi1679: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1990: +; NoVLX-NEXT: .Lcfi1680: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56447,12 +55207,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1991: +; NoVLX-NEXT: .Lcfi1681: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1992: +; NoVLX-NEXT: .Lcfi1682: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1993: +; NoVLX-NEXT: .Lcfi1683: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56502,12 +55262,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; 
NoVLX-NEXT: .Lcfi1994: +; NoVLX-NEXT: .Lcfi1684: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1995: +; NoVLX-NEXT: .Lcfi1685: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1996: +; NoVLX-NEXT: .Lcfi1686: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56551,12 +55311,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi1997: +; NoVLX-NEXT: .Lcfi1687: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi1998: +; NoVLX-NEXT: .Lcfi1688: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi1999: +; NoVLX-NEXT: .Lcfi1689: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56601,12 +55361,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2000: +; NoVLX-NEXT: .Lcfi1690: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2001: +; NoVLX-NEXT: .Lcfi1691: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2002: +; NoVLX-NEXT: .Lcfi1692: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -56655,12 +55415,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2003: +; NoVLX-NEXT: .Lcfi1693: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2004: +; NoVLX-NEXT: .Lcfi1694: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2005: +; NoVLX-NEXT: .Lcfi1695: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -56713,20 +55473,20 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2006: +; NoVLX-NEXT: 
.Lcfi1696: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2007: +; NoVLX-NEXT: .Lcfi1697: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2008: +; NoVLX-NEXT: .Lcfi1698: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp ; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) @@ -56772,12 +55532,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2009: +; NoVLX-NEXT: .Lcfi1699: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2010: +; NoVLX-NEXT: .Lcfi1700: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2011: +; NoVLX-NEXT: .Lcfi1701: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -56838,8 +55598,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -56895,8 +55655,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ 
-56954,8 +55714,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -57021,8 +55781,8 @@ ; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -57088,8 +55848,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -57157,8 +55917,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kshiftlw $7, %k0, %k0 ; NoVLX-NEXT: kshiftrw $7, %k0, %k0 @@ -57220,8 +55980,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -57276,8 +56036,8 @@ ; NoVLX-NEXT: 
vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -57334,8 +56094,8 @@ ; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -57400,8 +56160,8 @@ ; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -57466,8 +56226,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -57534,8 +56294,8 @@ ; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpextrb $4, %xmm0, %eax ; NoVLX-NEXT: kmovw %eax, %k1 -; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; NoVLX-NEXT: andl $1, %eax ; NoVLX-NEXT: kmovw %eax, %k0 ; NoVLX-NEXT: kxorw %k0, %k0, %k1 @@ -57592,12 +56352,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: 
pushq %rbp -; NoVLX-NEXT: .Lcfi2012: +; NoVLX-NEXT: .Lcfi1702: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2013: +; NoVLX-NEXT: .Lcfi1703: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2014: +; NoVLX-NEXT: .Lcfi1704: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -57637,12 +56397,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2015: +; NoVLX-NEXT: .Lcfi1705: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2016: +; NoVLX-NEXT: .Lcfi1706: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2017: +; NoVLX-NEXT: .Lcfi1707: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -57683,12 +56443,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2018: +; NoVLX-NEXT: .Lcfi1708: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2019: +; NoVLX-NEXT: .Lcfi1709: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2020: +; NoVLX-NEXT: .Lcfi1710: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -57733,12 +56493,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2021: +; NoVLX-NEXT: .Lcfi1711: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2022: +; NoVLX-NEXT: .Lcfi1712: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2023: +; NoVLX-NEXT: .Lcfi1713: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -57788,12 +56548,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2024: +; 
NoVLX-NEXT: .Lcfi1714: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2025: +; NoVLX-NEXT: .Lcfi1715: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2026: +; NoVLX-NEXT: .Lcfi1716: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -57844,12 +56604,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2027: +; NoVLX-NEXT: .Lcfi1717: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2028: +; NoVLX-NEXT: .Lcfi1718: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2029: +; NoVLX-NEXT: .Lcfi1719: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -57902,12 +56662,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2030: +; NoVLX-NEXT: .Lcfi1720: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2031: +; NoVLX-NEXT: .Lcfi1721: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2032: +; NoVLX-NEXT: .Lcfi1722: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -57953,12 +56713,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2033: +; NoVLX-NEXT: .Lcfi1723: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2034: +; NoVLX-NEXT: .Lcfi1724: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2035: +; NoVLX-NEXT: .Lcfi1725: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -58005,12 +56765,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2036: +; NoVLX-NEXT: .Lcfi1726: ; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2037: +; NoVLX-NEXT: .Lcfi1727: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2038: +; NoVLX-NEXT: .Lcfi1728: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -58061,12 +56821,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2039: +; NoVLX-NEXT: .Lcfi1729: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2040: +; NoVLX-NEXT: .Lcfi1730: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2041: +; NoVLX-NEXT: .Lcfi1731: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -58122,12 +56882,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2042: +; NoVLX-NEXT: .Lcfi1732: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2043: +; NoVLX-NEXT: .Lcfi1733: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2044: +; NoVLX-NEXT: .Lcfi1734: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -58184,12 +56944,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2045: +; NoVLX-NEXT: .Lcfi1735: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2046: +; NoVLX-NEXT: .Lcfi1736: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2047: +; NoVLX-NEXT: .Lcfi1737: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $96, %rsp @@ -58466,12 +57226,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2048: +; NoVLX-NEXT: .Lcfi1738: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; 
NoVLX-NEXT: .Lcfi2049: +; NoVLX-NEXT: .Lcfi1739: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2050: +; NoVLX-NEXT: .Lcfi1740: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -58480,35 +57240,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, 
%k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -58539,12 +57299,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2051: +; NoVLX-NEXT: .Lcfi1741: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2052: +; NoVLX-NEXT: .Lcfi1742: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2053: +; NoVLX-NEXT: .Lcfi1743: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -58553,35 +57313,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, 
%xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -58613,12 +57373,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2054: +; NoVLX-NEXT: .Lcfi1744: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2055: +; NoVLX-NEXT: .Lcfi1745: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2056: +; NoVLX-NEXT: .Lcfi1746: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -58627,35 +57387,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, 
%eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -58689,12 +57449,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2057: +; NoVLX-NEXT: .Lcfi1747: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2058: +; NoVLX-NEXT: .Lcfi1748: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2059: +; NoVLX-NEXT: .Lcfi1749: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -58704,35 +57464,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edx +; 
NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 -; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 ; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -58766,12 +57526,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2060: +; NoVLX-NEXT: .Lcfi1750: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2061: +; NoVLX-NEXT: .Lcfi1751: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2062: +; NoVLX-NEXT: .Lcfi1752: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -58781,35 +57541,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw 
%k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -58844,12 +57604,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2063: +; NoVLX-NEXT: .Lcfi1753: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2064: +; NoVLX-NEXT: .Lcfi1754: ; NoVLX-NEXT: 
.cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2065: +; NoVLX-NEXT: .Lcfi1755: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $32, %rsp @@ -58859,35 +57619,35 @@ ; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kshiftlw $15, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $14, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $13, %k0, %k1 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 ; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kmovw %k1, %eax ; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kmovw %k1, %ecx ; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kmovw %k1, %edx ; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k1, %k1 -; NoVLX-NEXT: kmovw %k1, %ecx -; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: kshiftrw $15, %k0, %k0 -; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; 
NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 @@ -58972,12 +57732,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2066: +; NoVLX-NEXT: .Lcfi1756: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2067: +; NoVLX-NEXT: .Lcfi1757: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2068: +; NoVLX-NEXT: .Lcfi1758: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -59050,12 +57810,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2069: +; NoVLX-NEXT: .Lcfi1759: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2070: +; NoVLX-NEXT: .Lcfi1760: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2071: +; NoVLX-NEXT: .Lcfi1761: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -59129,12 +57889,12 @@ ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2072: +; NoVLX-NEXT: .Lcfi1762: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2073: +; NoVLX-NEXT: .Lcfi1763: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2074: +; NoVLX-NEXT: .Lcfi1764: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -59210,12 +57970,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2075: +; NoVLX-NEXT: .Lcfi1765: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2076: +; NoVLX-NEXT: .Lcfi1766: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2077: +; NoVLX-NEXT: .Lcfi1767: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp 
; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -59292,12 +58052,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2078: +; NoVLX-NEXT: .Lcfi1768: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2079: +; NoVLX-NEXT: .Lcfi1769: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2080: +; NoVLX-NEXT: .Lcfi1770: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp @@ -59375,12 +58135,12 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b: ; NoVLX: # BB#0: # %entry ; NoVLX-NEXT: pushq %rbp -; NoVLX-NEXT: .Lcfi2081: +; NoVLX-NEXT: .Lcfi1771: ; NoVLX-NEXT: .cfi_def_cfa_offset 16 -; NoVLX-NEXT: .Lcfi2082: +; NoVLX-NEXT: .Lcfi1772: ; NoVLX-NEXT: .cfi_offset %rbp, -16 ; NoVLX-NEXT: movq %rsp, %rbp -; NoVLX-NEXT: .Lcfi2083: +; NoVLX-NEXT: .Lcfi1773: ; NoVLX-NEXT: .cfi_def_cfa_register %rbp ; NoVLX-NEXT: andq $-32, %rsp ; NoVLX-NEXT: subq $64, %rsp Index: test/CodeGen/X86/bmi-schedule.ll =================================================================== --- test/CodeGen/X86/bmi-schedule.ll +++ test/CodeGen/X86/bmi-schedule.ll @@ -20,10 +20,10 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50] ; HASWELL-NEXT: notl %edi # sched: [1:0.25] -; HASWELL-NEXT: andw (%rdx), %di # sched: [5:0.50] +; HASWELL-NEXT: andw (%rdx), %di # sched: [1:0.50] ; HASWELL-NEXT: addl %edi, %eax # sched: [1:0.25] ; HASWELL-NEXT: # kill: %AX %AX %EAX -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andn_i16: ; BTVER2: # BB#0: @@ -61,9 +61,9 @@ ; HASWELL-LABEL: test_andn_i32: ; HASWELL: # BB#0: ; HASWELL-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50] -; HASWELL-NEXT: andnl (%rdx), %edi, %eax # sched: [4:0.50] +; HASWELL-NEXT: andnl (%rdx), %edi, %eax # sched: [1:0.50] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; 
HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andn_i32: ; BTVER2: # BB#0: @@ -97,9 +97,9 @@ ; HASWELL-LABEL: test_andn_i64: ; HASWELL: # BB#0: ; HASWELL-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50] -; HASWELL-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:0.50] +; HASWELL-NEXT: andnq (%rdx), %rdi, %rax # sched: [1:0.50] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andn_i64: ; BTVER2: # BB#0: @@ -132,10 +132,10 @@ ; ; HASWELL-LABEL: test_bextr_i32: ; HASWELL: # BB#0: -; HASWELL-NEXT: bextrl %edi, (%rdx), %ecx # sched: [6:0.50] +; HASWELL-NEXT: bextrl %edi, (%rdx), %ecx # sched: [2:0.50] ; HASWELL-NEXT: bextrl %edi, %esi, %eax # sched: [2:0.50] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_bextr_i32: ; BTVER2: # BB#0: @@ -168,10 +168,10 @@ ; ; HASWELL-LABEL: test_bextr_i64: ; HASWELL: # BB#0: -; HASWELL-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [6:0.50] +; HASWELL-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [2:0.50] ; HASWELL-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:0.50] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_bextr_i64: ; BTVER2: # BB#0: @@ -204,10 +204,10 @@ ; ; HASWELL-LABEL: test_blsi_i32: ; HASWELL: # BB#0: -; HASWELL-NEXT: blsil (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsil (%rsi), %ecx # sched: [1:0.50] ; HASWELL-NEXT: blsil %edi, %eax # sched: [1:0.50] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blsi_i32: ; BTVER2: # BB#0: @@ -241,10 +241,10 @@ ; ; HASWELL-LABEL: test_blsi_i64: ; HASWELL: # BB#0: -; HASWELL-NEXT: blsiq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsiq 
(%rsi), %rcx # sched: [1:0.50] ; HASWELL-NEXT: blsiq %rdi, %rax # sched: [1:0.50] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blsi_i64: ; BTVER2: # BB#0: @@ -278,10 +278,10 @@ ; ; HASWELL-LABEL: test_blsmsk_i32: ; HASWELL: # BB#0: -; HASWELL-NEXT: blsmskl (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsmskl (%rsi), %ecx # sched: [1:0.50] ; HASWELL-NEXT: blsmskl %edi, %eax # sched: [1:0.50] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blsmsk_i32: ; BTVER2: # BB#0: @@ -315,10 +315,10 @@ ; ; HASWELL-LABEL: test_blsmsk_i64: ; HASWELL: # BB#0: -; HASWELL-NEXT: blsmskq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsmskq (%rsi), %rcx # sched: [1:0.50] ; HASWELL-NEXT: blsmskq %rdi, %rax # sched: [1:0.50] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blsmsk_i64: ; BTVER2: # BB#0: @@ -352,10 +352,10 @@ ; ; HASWELL-LABEL: test_blsr_i32: ; HASWELL: # BB#0: -; HASWELL-NEXT: blsrl (%rsi), %ecx # sched: [4:0.50] +; HASWELL-NEXT: blsrl (%rsi), %ecx # sched: [1:0.50] ; HASWELL-NEXT: blsrl %edi, %eax # sched: [1:0.50] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blsr_i32: ; BTVER2: # BB#0: @@ -389,10 +389,10 @@ ; ; HASWELL-LABEL: test_blsr_i64: ; HASWELL: # BB#0: -; HASWELL-NEXT: blsrq (%rsi), %rcx # sched: [4:0.50] +; HASWELL-NEXT: blsrq (%rsi), %rcx # sched: [1:0.50] ; HASWELL-NEXT: blsrq %rdi, %rax # sched: [1:0.50] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blsr_i64: ; BTVER2: # BB#0: @@ -427,11 +427,11 @@ ; ; HASWELL-LABEL: test_cttz_i16: 
; HASWELL: # BB#0: -; HASWELL-NEXT: tzcntw (%rsi), %cx # sched: [7:1.00] +; HASWELL-NEXT: tzcntw (%rsi), %cx # sched: [3:1.00] ; HASWELL-NEXT: tzcntw %di, %ax # sched: [3:1.00] ; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] ; HASWELL-NEXT: # kill: %AX %AX %EAX -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cttz_i16: ; BTVER2: # BB#0: @@ -466,10 +466,10 @@ ; ; HASWELL-LABEL: test_cttz_i32: ; HASWELL: # BB#0: -; HASWELL-NEXT: tzcntl (%rsi), %ecx # sched: [7:1.00] +; HASWELL-NEXT: tzcntl (%rsi), %ecx # sched: [3:1.00] ; HASWELL-NEXT: tzcntl %edi, %eax # sched: [3:1.00] ; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cttz_i32: ; BTVER2: # BB#0: @@ -502,10 +502,10 @@ ; ; HASWELL-LABEL: test_cttz_i64: ; HASWELL: # BB#0: -; HASWELL-NEXT: tzcntq (%rsi), %rcx # sched: [7:1.00] +; HASWELL-NEXT: tzcntq (%rsi), %rcx # sched: [3:1.00] ; HASWELL-NEXT: tzcntq %rdi, %rax # sched: [3:1.00] ; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cttz_i64: ; BTVER2: # BB#0: Index: test/CodeGen/X86/bmi2-schedule.ll =================================================================== --- test/CodeGen/X86/bmi2-schedule.ll +++ test/CodeGen/X86/bmi2-schedule.ll @@ -15,10 +15,10 @@ ; ; HASWELL-LABEL: test_bzhi_i32: ; HASWELL: # BB#0: -; HASWELL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [4:0.50] +; HASWELL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [1:0.50] ; HASWELL-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.50] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_bzhi_i32: ; ZNVER1: # BB#0: @@ -44,10 +44,10 @@ ; ; HASWELL-LABEL: test_bzhi_i64: ; HASWELL: # BB#0: -; HASWELL-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [4:0.50] +; HASWELL-NEXT: bzhiq 
%rdi, (%rdx), %rcx # sched: [1:0.50] ; HASWELL-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.50] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_bzhi_i64: ; ZNVER1: # BB#0: @@ -73,10 +73,10 @@ ; ; HASWELL-LABEL: test_pdep_i32: ; HASWELL: # BB#0: -; HASWELL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [7:1.00] +; HASWELL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [3:1.00] ; HASWELL-NEXT: pdepl %esi, %edi, %eax # sched: [3:1.00] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pdep_i32: ; ZNVER1: # BB#0: @@ -102,10 +102,10 @@ ; ; HASWELL-LABEL: test_pdep_i64: ; HASWELL: # BB#0: -; HASWELL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [7:1.00] +; HASWELL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [3:1.00] ; HASWELL-NEXT: pdepq %rsi, %rdi, %rax # sched: [3:1.00] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pdep_i64: ; ZNVER1: # BB#0: @@ -131,10 +131,10 @@ ; ; HASWELL-LABEL: test_pext_i32: ; HASWELL: # BB#0: -; HASWELL-NEXT: pextl (%rdx), %edi, %ecx # sched: [7:1.00] +; HASWELL-NEXT: pextl (%rdx), %edi, %ecx # sched: [3:1.00] ; HASWELL-NEXT: pextl %esi, %edi, %eax # sched: [3:1.00] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pext_i32: ; ZNVER1: # BB#0: @@ -160,10 +160,10 @@ ; ; HASWELL-LABEL: test_pext_i64: ; HASWELL: # BB#0: -; HASWELL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [7:1.00] +; HASWELL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [3:1.00] ; HASWELL-NEXT: pextq %rsi, %rdi, %rax # sched: [3:1.00] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; ZNVER1-LABEL: test_pext_i64: ; ZNVER1: # 
BB#0: Index: test/CodeGen/X86/f16c-schedule.ll =================================================================== --- test/CodeGen/X86/f16c-schedule.ll +++ test/CodeGen/X86/f16c-schedule.ll @@ -23,10 +23,10 @@ ; ; HASWELL-LABEL: test_vcvtph2ps_128: ; HASWELL: # BB#0: -; HASWELL-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00] -; HASWELL-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [1:1.00] +; HASWELL-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [2:1.00] ; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_vcvtph2ps_128: ; BTVER2: # BB#0: @@ -66,10 +66,10 @@ ; ; HASWELL-LABEL: test_vcvtph2ps_256: ; HASWELL: # BB#0: -; HASWELL-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00] -; HASWELL-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [1:1.00] +; HASWELL-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [2:1.00] ; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_vcvtph2ps_256: ; BTVER2: # BB#0: @@ -108,8 +108,8 @@ ; HASWELL-LABEL: test_vcvtps2ph_128: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [4:1.00] -; HASWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [4:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_vcvtps2ph_128: ; BTVER2: # BB#0: @@ -147,10 +147,10 @@ ; ; HASWELL-LABEL: test_vcvtps2ph_256: ; HASWELL: # BB#0: -; HASWELL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [4:1.00] -; HASWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00] -; HASWELL-NEXT: vzeroupper # sched: [1:?] 
-; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:1.00] +; HASWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [6:1.00] +; HASWELL-NEXT: vzeroupper # sched: [4:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_vcvtps2ph_256: ; BTVER2: # BB#0: Index: test/CodeGen/X86/lea32-schedule.ll =================================================================== --- test/CodeGen/X86/lea32-schedule.ll +++ test/CodeGen/X86/lea32-schedule.ll @@ -45,7 +45,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: # kill: %EDI %EDI %RDI ; HASWELL-NEXT: leal -24(%rdi), %eax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_offset: ; BTVER2: # BB#0: @@ -97,7 +97,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: # kill: %EDI %EDI %RDI ; HASWELL-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_offset_big: ; BTVER2: # BB#0: @@ -155,7 +155,7 @@ ; HASWELL-NEXT: # kill: %ESI %ESI %RSI ; HASWELL-NEXT: # kill: %EDI %EDI %RDI ; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add: ; BTVER2: # BB#0: @@ -216,7 +216,7 @@ ; HASWELL-NEXT: # kill: %EDI %EDI %RDI ; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] ; HASWELL-NEXT: addl $16, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add_offset: ; BTVER2: # BB#0: @@ -280,7 +280,7 @@ ; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50] ; HASWELL-NEXT: addl $-4096, %eax # imm = 0xF000 ; HASWELL-NEXT: # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add_offset_big: ; BTVER2: # BB#0: @@ -335,7 +335,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: # kill: %EDI %EDI %RDI ; 
HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_mul: ; BTVER2: # BB#0: @@ -389,7 +389,7 @@ ; HASWELL-NEXT: # kill: %EDI %EDI %RDI ; HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] ; HASWELL-NEXT: addl $-32, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_mul_offset: ; BTVER2: # BB#0: @@ -446,7 +446,7 @@ ; HASWELL-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] ; HASWELL-NEXT: addl $10000, %eax # imm = 0x2710 ; HASWELL-NEXT: # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_mul_offset_big: ; BTVER2: # BB#0: @@ -504,7 +504,7 @@ ; HASWELL-NEXT: # kill: %ESI %ESI %RSI ; HASWELL-NEXT: # kill: %EDI %EDI %RDI ; HASWELL-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add_scale: ; BTVER2: # BB#0: @@ -566,7 +566,7 @@ ; HASWELL-NEXT: # kill: %EDI %EDI %RDI ; HASWELL-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50] ; HASWELL-NEXT: addl $96, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add_scale_offset: ; BTVER2: # BB#0: @@ -631,7 +631,7 @@ ; HASWELL-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50] ; HASWELL-NEXT: addl $-1200, %eax # imm = 0xFB50 ; HASWELL-NEXT: # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add_scale_offset_big: ; BTVER2: # BB#0: Index: test/CodeGen/X86/lea64-schedule.ll =================================================================== --- test/CodeGen/X86/lea64-schedule.ll +++ test/CodeGen/X86/lea64-schedule.ll @@ -40,7 +40,7 @@ ; HASWELL-LABEL: test_lea_offset: ; HASWELL: # BB#0: ; HASWELL-NEXT: leaq -24(%rdi), 
%rax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_offset: ; BTVER2: # BB#0: @@ -85,7 +85,7 @@ ; HASWELL-LABEL: test_lea_offset_big: ; HASWELL: # BB#0: ; HASWELL-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_offset_big: ; BTVER2: # BB#0: @@ -131,7 +131,7 @@ ; HASWELL-LABEL: test_lea_add: ; HASWELL: # BB#0: ; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add: ; BTVER2: # BB#0: @@ -178,7 +178,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] ; HASWELL-NEXT: addq $16, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add_offset: ; BTVER2: # BB#0: @@ -228,7 +228,7 @@ ; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50] ; HASWELL-NEXT: addq $-4096, %rax # imm = 0xF000 ; HASWELL-NEXT: # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add_offset_big: ; BTVER2: # BB#0: @@ -274,7 +274,7 @@ ; HASWELL-LABEL: test_lea_mul: ; HASWELL: # BB#0: ; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_mul: ; BTVER2: # BB#0: @@ -321,7 +321,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] ; HASWELL-NEXT: addq $-32, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_mul_offset: ; BTVER2: # BB#0: @@ -371,7 +371,7 @@ ; HASWELL-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] ; HASWELL-NEXT: addq $10000, %rax # imm = 0x2710 ; HASWELL-NEXT: # sched: [1:0.25] -; HASWELL-NEXT: retq # 
sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_mul_offset_big: ; BTVER2: # BB#0: @@ -417,7 +417,7 @@ ; HASWELL-LABEL: test_lea_add_scale: ; HASWELL: # BB#0: ; HASWELL-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add_scale: ; BTVER2: # BB#0: @@ -465,7 +465,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50] ; HASWELL-NEXT: addq $96, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add_scale_offset: ; BTVER2: # BB#0: @@ -516,7 +516,7 @@ ; HASWELL-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50] ; HASWELL-NEXT: addq $-1200, %rax # imm = 0xFB50 ; HASWELL-NEXT: # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lea_add_scale_offset_big: ; BTVER2: # BB#0: Index: test/CodeGen/X86/lzcnt-schedule.ll =================================================================== --- test/CodeGen/X86/lzcnt-schedule.ll +++ test/CodeGen/X86/lzcnt-schedule.ll @@ -17,11 +17,11 @@ ; ; HASWELL-LABEL: test_ctlz_i16: ; HASWELL: # BB#0: -; HASWELL-NEXT: lzcntw (%rsi), %cx -; HASWELL-NEXT: lzcntw %di, %ax +; HASWELL-NEXT: lzcntw (%rsi), %cx # sched: [3:1.00] +; HASWELL-NEXT: lzcntw %di, %ax # sched: [3:1.00] ; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] ; HASWELL-NEXT: # kill: %AX %AX %EAX -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_ctlz_i16: ; BTVER2: # BB#0: @@ -56,10 +56,10 @@ ; ; HASWELL-LABEL: test_ctlz_i32: ; HASWELL: # BB#0: -; HASWELL-NEXT: lzcntl (%rsi), %ecx -; HASWELL-NEXT: lzcntl %edi, %eax +; HASWELL-NEXT: lzcntl (%rsi), %ecx # sched: [3:1.00] +; HASWELL-NEXT: lzcntl %edi, %eax # sched: [3:1.00] ; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: 
retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_ctlz_i32: ; BTVER2: # BB#0: @@ -92,10 +92,10 @@ ; ; HASWELL-LABEL: test_ctlz_i64: ; HASWELL: # BB#0: -; HASWELL-NEXT: lzcntq (%rsi), %rcx -; HASWELL-NEXT: lzcntq %rdi, %rax +; HASWELL-NEXT: lzcntq (%rsi), %rcx # sched: [3:1.00] +; HASWELL-NEXT: lzcntq %rdi, %rax # sched: [3:1.00] ; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_ctlz_i64: ; BTVER2: # BB#0: Index: test/CodeGen/X86/mul-constant-i32.ll =================================================================== --- test/CodeGen/X86/mul-constant-i32.ll +++ test/CodeGen/X86/mul-constant-i32.ll @@ -17,7 +17,7 @@ ; X64-HSW-LABEL: test_mul_by_1: ; X64-HSW: # BB#0: ; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_1: ; X64-JAG: # BB#0: @@ -32,7 +32,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_1: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_1: ; JAG-NOOPT: # BB#0: @@ -63,7 +63,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_2: ; X64-JAG: # BB#0: @@ -81,7 +81,7 @@ ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI ; HSW-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_2: ; JAG-NOOPT: # BB#0: @@ -114,7 +114,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; 
X64-JAG-LABEL: test_mul_by_3: ; X64-JAG: # BB#0: @@ -131,7 +131,7 @@ ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI ; HSW-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_3: ; JAG-NOOPT: # BB#0: @@ -165,7 +165,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_4: ; X64-JAG: # BB#0: @@ -183,7 +183,7 @@ ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI ; HSW-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_4: ; JAG-NOOPT: # BB#0: @@ -216,7 +216,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_5: ; X64-JAG: # BB#0: @@ -233,7 +233,7 @@ ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI ; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_5: ; JAG-NOOPT: # BB#0: @@ -269,7 +269,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] ; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_6: ; X64-JAG: # BB#0: @@ -285,8 +285,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_6: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # 
sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_6: ; JAG-NOOPT: # BB#0: @@ -321,7 +321,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] ; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_7: ; X64-JAG: # BB#0: @@ -337,8 +337,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_7: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_7: ; JAG-NOOPT: # BB#0: @@ -371,7 +371,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_8: ; X64-JAG: # BB#0: @@ -389,7 +389,7 @@ ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI ; HSW-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_8: ; JAG-NOOPT: # BB#0: @@ -422,7 +422,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_9: ; X64-JAG: # BB#0: @@ -439,7 +439,7 @@ ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: # kill: %EDI %EDI %RDI ; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_9: ; JAG-NOOPT: # BB#0: @@ -475,7 +475,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] ; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # 
sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_10: ; X64-JAG: # BB#0: @@ -491,8 +491,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_10: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_10: ; JAG-NOOPT: # BB#0: @@ -527,7 +527,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_11: ; X64-JAG: # BB#0: @@ -543,8 +543,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_11: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_11: ; JAG-NOOPT: # BB#0: @@ -577,7 +577,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_12: ; X64-JAG: # BB#0: @@ -593,8 +593,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_12: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_12: ; JAG-NOOPT: # BB#0: @@ -629,7 +629,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; 
X64-JAG-LABEL: test_mul_by_13: ; X64-JAG: # BB#0: @@ -645,8 +645,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_13: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_13: ; JAG-NOOPT: # BB#0: @@ -681,7 +681,7 @@ ; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] ; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_14: ; X64-JAG: # BB#0: @@ -698,8 +698,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_14: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_14: ; JAG-NOOPT: # BB#0: @@ -732,7 +732,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_15: ; X64-JAG: # BB#0: @@ -748,8 +748,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_15: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_15: ; JAG-NOOPT: # BB#0: @@ -782,7 +782,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: shll $4, %edi # sched: [1:0.50] ; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_16: ; X64-JAG: # BB#0: @@ -800,7 +800,7 @@ ; 
HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50] ; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_16: ; JAG-NOOPT: # BB#0: @@ -838,7 +838,7 @@ ; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] ; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_17: ; X64-JAG: # BB#0: @@ -855,8 +855,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_17: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_17: ; JAG-NOOPT: # BB#0: @@ -892,7 +892,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] ; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_18: ; X64-JAG: # BB#0: @@ -908,8 +908,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_18: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_18: ; JAG-NOOPT: # BB#0: @@ -946,7 +946,7 @@ ; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] ; X64-HSW-NEXT: shll $2, %eax # sched: [1:0.50] ; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_19: ; X64-JAG: # BB#0: @@ -963,8 +963,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_19: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [4:1.00] 
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_19: ; JAG-NOOPT: # BB#0: @@ -997,7 +997,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_20: ; X64-JAG: # BB#0: @@ -1013,8 +1013,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_20: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_20: ; JAG-NOOPT: # BB#0: @@ -1049,7 +1049,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_21: ; X64-JAG: # BB#0: @@ -1065,8 +1065,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_21: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_21: ; JAG-NOOPT: # BB#0: @@ -1101,7 +1101,7 @@ ; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] ; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_22: ; X64-JAG: # BB#0: @@ -1118,8 +1118,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_22: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] 
+; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_22: ; JAG-NOOPT: # BB#0: @@ -1154,7 +1154,7 @@ ; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] ; X64-HSW-NEXT: shll $3, %eax # sched: [1:0.50] ; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_23: ; X64-JAG: # BB#0: @@ -1171,8 +1171,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_23: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_23: ; JAG-NOOPT: # BB#0: @@ -1205,7 +1205,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: shll $3, %edi # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_24: ; X64-JAG: # BB#0: @@ -1221,8 +1221,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_24: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_24: ; JAG-NOOPT: # BB#0: @@ -1257,7 +1257,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_25: ; X64-JAG: # BB#0: @@ -1273,8 +1273,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_25: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: 
[3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_25: ; JAG-NOOPT: # BB#0: @@ -1311,7 +1311,7 @@ ; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] ; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_26: ; X64-JAG: # BB#0: @@ -1328,8 +1328,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_26: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_26: ; JAG-NOOPT: # BB#0: @@ -1362,7 +1362,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_27: ; X64-JAG: # BB#0: @@ -1378,8 +1378,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_27: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_27: ; JAG-NOOPT: # BB#0: @@ -1416,7 +1416,7 @@ ; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] ; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_28: ; X64-JAG: # BB#0: @@ -1433,8 +1433,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_28: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00] +; 
HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_28: ; JAG-NOOPT: # BB#0: @@ -1471,7 +1471,7 @@ ; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] ; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] ; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_29: ; X64-JAG: # BB#0: @@ -1489,8 +1489,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_29: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_29: ; JAG-NOOPT: # BB#0: @@ -1526,7 +1526,7 @@ ; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50] ; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] ; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_30: ; X64-JAG: # BB#0: @@ -1543,8 +1543,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_30: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_30: ; JAG-NOOPT: # BB#0: @@ -1578,7 +1578,7 @@ ; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] ; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50] ; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_31: ; X64-JAG: # BB#0: @@ -1594,8 +1594,8 @@ ; ; HSW-NOOPT-LABEL: test_mul_by_31: ; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; 
JAG-NOOPT-LABEL: test_mul_by_31: ; JAG-NOOPT: # BB#0: @@ -1628,7 +1628,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: shll $5, %edi # sched: [1:0.50] ; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_32: ; X64-JAG: # BB#0: @@ -1646,7 +1646,7 @@ ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50] ; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_32: ; JAG-NOOPT: # BB#0: @@ -1687,7 +1687,7 @@ ; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] ; X64-HSW-NEXT: addl $2, %eax # sched: [1:0.25] ; X64-HSW-NEXT: imull %ecx, %eax # sched: [4:1.00] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_spec: ; X64-JAG: # BB#0: @@ -1713,7 +1713,7 @@ ; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] ; HSW-NOOPT-NEXT: addl $2, %eax # sched: [1:0.25] ; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_spec: ; JAG-NOOPT: # BB#0: Index: test/CodeGen/X86/mul-constant-i64.ll =================================================================== --- test/CodeGen/X86/mul-constant-i64.ll +++ test/CodeGen/X86/mul-constant-i64.ll @@ -18,7 +18,7 @@ ; X64-HSW-LABEL: test_mul_by_1: ; X64-HSW: # BB#0: ; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_1: ; X64-JAG: # BB#0: @@ -34,7 +34,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_1: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_1: ; JAG-NOOPT: # BB#0: @@ -66,7 +66,7 @@ ; 
X64-HSW-LABEL: test_mul_by_2: ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_2: ; X64-JAG: # BB#0: @@ -84,7 +84,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_2: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_2: ; JAG-NOOPT: # BB#0: @@ -116,7 +116,7 @@ ; X64-HSW-LABEL: test_mul_by_3: ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_3: ; X64-JAG: # BB#0: @@ -134,7 +134,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_3: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_3: ; JAG-NOOPT: # BB#0: @@ -166,7 +166,7 @@ ; X64-HSW-LABEL: test_mul_by_4: ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_4: ; X64-JAG: # BB#0: @@ -184,7 +184,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_4: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_4: ; JAG-NOOPT: # BB#0: @@ -216,7 +216,7 @@ ; X64-HSW-LABEL: test_mul_by_5: ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_5: ; X64-JAG: # BB#0: @@ -234,7 +234,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_5: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; 
HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_5: ; JAG-NOOPT: # BB#0: @@ -268,7 +268,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] ; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_6: ; X64-JAG: # BB#0: @@ -287,7 +287,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_6: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_6: ; JAG-NOOPT: # BB#0: @@ -323,7 +323,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] ; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_7: ; X64-JAG: # BB#0: @@ -342,7 +342,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_7: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_7: ; JAG-NOOPT: # BB#0: @@ -375,7 +375,7 @@ ; X64-HSW-LABEL: test_mul_by_8: ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_8: ; X64-JAG: # BB#0: @@ -393,7 +393,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_8: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_8: ; JAG-NOOPT: # BB#0: @@ -425,7 +425,7 @@ ; X64-HSW-LABEL: test_mul_by_9: ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; 
X64-JAG-LABEL: test_mul_by_9: ; X64-JAG: # BB#0: @@ -443,7 +443,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_9: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_9: ; JAG-NOOPT: # BB#0: @@ -477,7 +477,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] ; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_10: ; X64-JAG: # BB#0: @@ -496,7 +496,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_10: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_10: ; JAG-NOOPT: # BB#0: @@ -532,7 +532,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_11: ; X64-JAG: # BB#0: @@ -551,7 +551,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_11: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_11: ; JAG-NOOPT: # BB#0: @@ -585,7 +585,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_12: ; X64-JAG: # BB#0: @@ -604,7 +604,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_12: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_12: ; JAG-NOOPT: 
# BB#0: @@ -640,7 +640,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_13: ; X64-JAG: # BB#0: @@ -659,7 +659,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_13: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_13: ; JAG-NOOPT: # BB#0: @@ -696,7 +696,7 @@ ; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] ; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_14: ; X64-JAG: # BB#0: @@ -716,7 +716,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_14: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_14: ; JAG-NOOPT: # BB#0: @@ -751,7 +751,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_15: ; X64-JAG: # BB#0: @@ -770,7 +770,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_15: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_15: ; JAG-NOOPT: # BB#0: @@ -804,7 +804,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: shlq $4, %rdi # sched: [1:0.50] ; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_16: ; 
X64-JAG: # BB#0: @@ -824,7 +824,7 @@ ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50] ; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_16: ; JAG-NOOPT: # BB#0: @@ -864,7 +864,7 @@ ; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] ; X64-HSW-NEXT: shlq $4, %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_17: ; X64-JAG: # BB#0: @@ -884,7 +884,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_17: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_17: ; JAG-NOOPT: # BB#0: @@ -920,7 +920,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] ; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_18: ; X64-JAG: # BB#0: @@ -939,7 +939,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_18: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_18: ; JAG-NOOPT: # BB#0: @@ -977,7 +977,7 @@ ; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] ; X64-HSW-NEXT: shlq $2, %rax # sched: [1:0.50] ; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_19: ; X64-JAG: # BB#0: @@ -997,7 +997,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_19: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: 
test_mul_by_19: ; JAG-NOOPT: # BB#0: @@ -1031,7 +1031,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_20: ; X64-JAG: # BB#0: @@ -1050,7 +1050,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_20: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_20: ; JAG-NOOPT: # BB#0: @@ -1086,7 +1086,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_21: ; X64-JAG: # BB#0: @@ -1105,7 +1105,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_21: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_21: ; JAG-NOOPT: # BB#0: @@ -1142,7 +1142,7 @@ ; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] ; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_22: ; X64-JAG: # BB#0: @@ -1162,7 +1162,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_22: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_22: ; JAG-NOOPT: # BB#0: @@ -1199,7 +1199,7 @@ ; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] ; X64-HSW-NEXT: shlq $3, %rax # sched: [1:0.50] ; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; 
X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_23: ; X64-JAG: # BB#0: @@ -1219,7 +1219,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_23: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_23: ; JAG-NOOPT: # BB#0: @@ -1253,7 +1253,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: shlq $3, %rdi # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_24: ; X64-JAG: # BB#0: @@ -1272,7 +1272,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_24: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_24: ; JAG-NOOPT: # BB#0: @@ -1308,7 +1308,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_25: ; X64-JAG: # BB#0: @@ -1327,7 +1327,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_25: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_25: ; JAG-NOOPT: # BB#0: @@ -1365,7 +1365,7 @@ ; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] ; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_26: ; X64-JAG: # BB#0: @@ -1385,7 +1385,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_26: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: 
[1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_26: ; JAG-NOOPT: # BB#0: @@ -1420,7 +1420,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_27: ; X64-JAG: # BB#0: @@ -1439,7 +1439,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_27: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_27: ; JAG-NOOPT: # BB#0: @@ -1477,7 +1477,7 @@ ; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] ; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_28: ; X64-JAG: # BB#0: @@ -1497,7 +1497,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_28: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_28: ; JAG-NOOPT: # BB#0: @@ -1536,7 +1536,7 @@ ; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] ; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] ; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_29: ; X64-JAG: # BB#0: @@ -1557,7 +1557,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_29: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_29: ; JAG-NOOPT: # BB#0: @@ -1596,7 +1596,7 @@ ; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50] ; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] ; 
X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_30: ; X64-JAG: # BB#0: @@ -1617,7 +1617,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_30: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_30: ; JAG-NOOPT: # BB#0: @@ -1654,7 +1654,7 @@ ; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] ; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50] ; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_31: ; X64-JAG: # BB#0: @@ -1674,7 +1674,7 @@ ; HSW-NOOPT-LABEL: test_mul_by_31: ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_31: ; JAG-NOOPT: # BB#0: @@ -1709,7 +1709,7 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: shlq $5, %rdi # sched: [1:0.50] ; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_32: ; X64-JAG: # BB#0: @@ -1729,7 +1729,7 @@ ; HSW-NOOPT: # BB#0: ; HSW-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50] ; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_by_32: ; JAG-NOOPT: # BB#0: @@ -1792,8 +1792,8 @@ ; X64-HSW-NEXT: addq $42, %rcx # sched: [1:0.25] ; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] ; X64-HSW-NEXT: addq $2, %rax # sched: [1:0.25] -; X64-HSW-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; X64-HSW-NEXT: retq # sched: [1:1.00] +; X64-HSW-NEXT: imulq %rcx, %rax # sched: [4:1.00] +; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_spec: ; 
X64-JAG: # BB#0: @@ -1840,8 +1840,8 @@ ; HSW-NOOPT-NEXT: addq $42, %rcx # sched: [1:0.25] ; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] ; HSW-NOOPT-NEXT: addq $2, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] +; HSW-NOOPT-NEXT: imulq %rcx, %rax # sched: [4:1.00] +; HSW-NOOPT-NEXT: retq # sched: [2:1.00] ; ; JAG-NOOPT-LABEL: test_mul_spec: ; JAG-NOOPT: # BB#0: Index: test/CodeGen/X86/popcnt-schedule.ll =================================================================== --- test/CodeGen/X86/popcnt-schedule.ll +++ test/CodeGen/X86/popcnt-schedule.ll @@ -37,11 +37,11 @@ ; ; HASWELL-LABEL: test_ctpop_i16: ; HASWELL: # BB#0: -; HASWELL-NEXT: popcntw (%rsi), %cx # sched: [7:1.00] +; HASWELL-NEXT: popcntw (%rsi), %cx # sched: [3:1.00] ; HASWELL-NEXT: popcntw %di, %ax # sched: [3:1.00] ; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] ; HASWELL-NEXT: # kill: %AX %AX %EAX -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_ctpop_i16: ; BTVER2: # BB#0: @@ -90,10 +90,10 @@ ; ; HASWELL-LABEL: test_ctpop_i32: ; HASWELL: # BB#0: -; HASWELL-NEXT: popcntl (%rsi), %ecx # sched: [7:1.00] +; HASWELL-NEXT: popcntl (%rsi), %ecx # sched: [3:1.00] ; HASWELL-NEXT: popcntl %edi, %eax # sched: [3:1.00] ; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_ctpop_i32: ; BTVER2: # BB#0: @@ -140,10 +140,10 @@ ; ; HASWELL-LABEL: test_ctpop_i64: ; HASWELL: # BB#0: -; HASWELL-NEXT: popcntq (%rsi), %rcx # sched: [7:1.00] +; HASWELL-NEXT: popcntq (%rsi), %rcx # sched: [3:1.00] ; HASWELL-NEXT: popcntq %rdi, %rax # sched: [3:1.00] ; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_ctpop_i64: ; BTVER2: # BB#0: Index: test/CodeGen/X86/pr32329.ll 
=================================================================== --- test/CodeGen/X86/pr32329.ll +++ test/CodeGen/X86/pr32329.ll @@ -36,33 +36,33 @@ ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .Lcfi7: ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl obj, %edx ; X86-NEXT: movsbl var_27, %eax -; X86-NEXT: movzwl var_2, %esi ; X86-NEXT: movl var_310, %ecx ; X86-NEXT: imull %eax, %ecx +; X86-NEXT: movl obj, %esi ; X86-NEXT: addl var_24, %ecx -; X86-NEXT: andl $4194303, %edx # imm = 0x3FFFFF -; X86-NEXT: leal (%edx,%edx), %ebx -; X86-NEXT: subl %eax, %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: subl %esi, %edi -; X86-NEXT: imull %edi, %ecx +; X86-NEXT: movzwl var_2, %edi +; X86-NEXT: andl $4194303, %esi # imm = 0x3FFFFF +; X86-NEXT: leal (%esi,%esi), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: subl %edi, %ebx +; X86-NEXT: imull %ebx, %ecx ; X86-NEXT: addl $-1437483407, %ecx # imm = 0xAA51BE71 -; X86-NEXT: movl $9, %esi +; X86-NEXT: movl $9, %edi ; X86-NEXT: xorl %ebp, %ebp -; X86-NEXT: shldl %cl, %esi, %ebp -; X86-NEXT: shlxl %ecx, %esi, %esi +; X86-NEXT: shldl %cl, %edi, %ebp +; X86-NEXT: shlxl %ecx, %edi, %edi ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: cmovnel %edi, %ebp ; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovnel %ecx, %esi -; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: cmovnel %ecx, %edi ; X86-NEXT: movl %ebp, var_50+4 -; X86-NEXT: movl %esi, var_50 +; X86-NEXT: cmpl %esi, %ebx ; X86-NEXT: setge var_205 -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: movb %bl, var_218 +; X86-NEXT: imull %eax, %edx +; X86-NEXT: movl %edi, var_50 +; X86-NEXT: movb %dl, var_218 ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -71,25 +71,25 @@ ; ; X64-LABEL: foo: ; X64: # BB#0: # %entry -; X64-NEXT: movl {{.*}}(%rip), %eax -; X64-NEXT: movsbl {{.*}}(%rip), %r9d +; X64-NEXT: movsbl {{.*}}(%rip), %eax +; X64-NEXT: movl {{.*}}(%rip), %ecx +; X64-NEXT: imull %eax, %ecx +; X64-NEXT: movl {{.*}}(%rip), %edx +; 
X64-NEXT: addl {{.*}}(%rip), %ecx ; X64-NEXT: movzwl {{.*}}(%rip), %r8d -; X64-NEXT: movl {{.*}}(%rip), %esi -; X64-NEXT: imull %r9d, %esi -; X64-NEXT: addl {{.*}}(%rip), %esi -; X64-NEXT: andl $4194303, %eax # imm = 0x3FFFFF -; X64-NEXT: leal (%rax,%rax), %edi -; X64-NEXT: subl %r9d, %edi -; X64-NEXT: movl %edi, %edx -; X64-NEXT: subl %r8d, %edx -; X64-NEXT: imull %edx, %esi -; X64-NEXT: addl $-1437483407, %esi # imm = 0xAA51BE71 -; X64-NEXT: movl $9, %ecx -; X64-NEXT: shlxq %rsi, %rcx, %rcx -; X64-NEXT: movq %rcx, {{.*}}(%rip) -; X64-NEXT: cmpl %eax, %edx +; X64-NEXT: andl $4194303, %edx # imm = 0x3FFFFF +; X64-NEXT: leal (%rdx,%rdx), %edi +; X64-NEXT: subl %eax, %edi +; X64-NEXT: movl %edi, %esi +; X64-NEXT: subl %r8d, %esi +; X64-NEXT: imull %esi, %ecx +; X64-NEXT: addl $-1437483407, %ecx # imm = 0xAA51BE71 +; X64-NEXT: movl $9, %r8d +; X64-NEXT: cmpl %edx, %esi ; X64-NEXT: setge {{.*}}(%rip) -; X64-NEXT: imull %r9d, %edi +; X64-NEXT: shlxq %rcx, %r8, %rcx +; X64-NEXT: imull %eax, %edi +; X64-NEXT: movq %rcx, {{.*}}(%rip) ; X64-NEXT: movb %dil, {{.*}}(%rip) ; X64-NEXT: retq entry: Index: test/CodeGen/X86/recip-fastmath.ll =================================================================== --- test/CodeGen/X86/recip-fastmath.ll +++ test/CodeGen/X86/recip-fastmath.ll @@ -51,9 +51,9 @@ ; ; HASWELL-LABEL: f32_no_estimate: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] -; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50] +; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_no_estimate: ; HASWELL-NO-FMA: # BB#0: @@ -63,9 +63,9 @@ ; ; AVX512-LABEL: f32_no_estimate: ; AVX512: # BB#0: -; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] -; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: 
[12:1.00] -; AVX512-NEXT: retq # sched: [1:1.00] +; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50] +; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00] +; AVX512-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 1.0, %x ret float %div } @@ -122,9 +122,9 @@ ; HASWELL-LABEL: f32_one_step: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_one_step: ; HASWELL-NO-FMA: # BB#0: @@ -139,9 +139,9 @@ ; AVX512-LABEL: f32_one_step: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: retq # sched: [1:1.00] +; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 1.0, %x ret float %div } @@ -220,13 +220,13 @@ ; HASWELL-LABEL: f32_two_step: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50] 
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_two_step: ; HASWELL-NO-FMA: # BB#0: @@ -245,13 +245,13 @@ ; AVX512-LABEL: f32_two_step: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] ; AVX512-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 -; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; AVX512-NEXT: retq # sched: [1:1.00] +; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50] +; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50] +; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 1.0, %x ret float %div } @@ -290,9 +290,9 @@ ; ; HASWELL-LABEL: v4f32_no_estimate: ; HASWELL: # BB#0: -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50] -; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50] +; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_no_estimate: ; HASWELL-NO-FMA: # BB#0: @@ -302,9 +302,9 @@ ; ; AVX512-LABEL: v4f32_no_estimate: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50] -; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00] -; AVX512-NEXT: retq # sched: [1:1.00] +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50] +; AVX512-NEXT: vdivps %xmm0, 
%xmm1, %xmm0 # sched: [13:1.00] +; AVX512-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -361,10 +361,10 @@ ; HASWELL-LABEL: v4f32_one_step: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step: ; HASWELL-NO-FMA: # BB#0: @@ -379,17 +379,17 @@ ; KNL-LABEL: v4f32_one_step: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: retq # sched: [1:1.00] +; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v4f32_one_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -468,13 +468,13 @@ ; HASWELL-LABEL: v4f32_two_step: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] ; HASWELL-NEXT: vmovaps 
%xmm1, %xmm3 # sched: [1:1.00] -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_two_step: ; HASWELL-NO-FMA: # BB#0: @@ -493,24 +493,24 @@ ; KNL-LABEL: v4f32_two_step: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] +; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; KNL-NEXT: retq # sched: [1:1.00] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v4f32_two_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] +; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: 
[5:0.50] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -552,9 +552,9 @@ ; ; HASWELL-LABEL: v8f32_no_estimate: ; HASWELL: # BB#0: -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00] -; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50] +; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_no_estimate: ; HASWELL-NO-FMA: # BB#0: @@ -564,9 +564,9 @@ ; ; AVX512-LABEL: v8f32_no_estimate: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00] -; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00] -; AVX512-NEXT: retq # sched: [1:1.00] +; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50] +; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00] +; AVX512-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -629,11 +629,11 @@ ; ; HASWELL-LABEL: v8f32_one_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: 
v8f32_one_step: ; HASWELL-NO-FMA: # BB#0: @@ -647,18 +647,18 @@ ; ; KNL-LABEL: v8f32_one_step: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: retq # sched: [1:1.00] +; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v8f32_one_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -749,14 +749,14 @@ ; ; HASWELL-LABEL: v8f32_two_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] ; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; 
HASWELL-NO-FMA-LABEL: v8f32_two_step: ; HASWELL-NO-FMA: # BB#0: @@ -774,25 +774,25 @@ ; ; KNL-LABEL: v8f32_two_step: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] +; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] ; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; KNL-NEXT: retq # sched: [1:1.00] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v8f32_two_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] +; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } Index: test/CodeGen/X86/recip-fastmath2.ll =================================================================== --- test/CodeGen/X86/recip-fastmath2.ll +++ test/CodeGen/X86/recip-fastmath2.ll @@ -45,20 
+45,20 @@ ; HASWELL-LABEL: f32_no_step_2: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_no_step_2: ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; AVX512-LABEL: f32_no_step_2: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; AVX512-NEXT: retq # sched: [1:1.00] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 1234.0, %x ret float %div } @@ -120,29 +120,29 @@ ; HASWELL-LABEL: f32_one_step_2: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2: ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: 
vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; AVX512-LABEL: f32_one_step_2: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; AVX512-NEXT: retq # sched: [1:1.00] +; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 3456.0, %x ret float %div } @@ -209,32 +209,32 @@ ; HASWELL-LABEL: f32_one_step_2_divs: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50] ; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; 
HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; AVX512-LABEL: f32_one_step_2_divs: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 -; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] -; AVX512-NEXT: retq # sched: [1:1.00] +; AVX512-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 3456.0, %x %div2 = fdiv fast float %div, %x ret float %div2 @@ -319,20 +319,20 @@ ; HASWELL-LABEL: f32_two_step_2: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 -; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] 
-; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50] +; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: f32_two_step_2: ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50] -; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [1:0.50] ; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00] @@ -340,20 +340,20 @@ ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; AVX512-LABEL: f32_two_step_2: ; AVX512: # BB#0: ; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] ; AVX512-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 -; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 -; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 -; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 -; AVX512-NEXT: vmulss 
{{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; AVX512-NEXT: retq # sched: [1:1.00] +; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50] +; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50] +; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; AVX512-NEXT: retq # sched: [2:1.00] %div = fdiv fast float 6789.0, %x ret float %div } @@ -415,39 +415,39 @@ ; HASWELL-LABEL: v4f32_one_step2: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step2: ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; 
HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; KNL-LABEL: v4f32_one_step2: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; KNL-NEXT: retq # sched: [1:1.00] +; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v4f32_one_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -514,43 +514,43 @@ ; HASWELL-LABEL: v4f32_one_step_2_divs: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50] ; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; 
HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; KNL-LABEL: v4f32_one_step_2_divs: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 -; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50] ; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] -; KNL-NEXT: retq # sched: [1:1.00] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v4f32_one_step_2_divs: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50] ; SKX-NEXT: vmulps 
%xmm0, %xmm1, %xmm0 # sched: [5:0.50] -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x %div2 = fdiv fast <4 x float> %div, %x ret <4 x float> %div2 @@ -635,20 +635,20 @@ ; HASWELL-LABEL: v4f32_two_step2: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50] +; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_two_step2: ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [1:0.50] ; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] @@ -656,32 +656,32 @@ ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: 
[3:1.00] -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; KNL-LABEL: v4f32_two_step2: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] +; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; KNL-NEXT: retq # sched: [1:1.00] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50] +; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v4f32_two_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] +; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50] ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 -; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 -; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 -; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50] +; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50] +; SKX-NEXT: 
vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -750,40 +750,40 @@ ; ; HASWELL-LABEL: v8f32_one_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; KNL-LABEL: 
v8f32_one_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; KNL-NEXT: retq # sched: [1:1.00] +; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v8f32_one_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -858,44 +858,44 @@ ; ; HASWELL-LABEL: v8f32_one_step_2_divs: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] -; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulps 
{{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50] +; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; KNL-LABEL: v8f32_one_step_2_divs: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 -; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] -; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] -; KNL-NEXT: retq # sched: [1:1.00] +; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: 
[5:0.50] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50] +; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v8f32_one_step_2_divs: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] -; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [5:0.50] +; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x %div2 = fdiv fast <8 x float> %div, %x ret <8 x float> %div2 @@ -993,54 +993,54 @@ ; ; HASWELL-LABEL: v8f32_two_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] +; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] ; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50] +; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_two_step2: ; HASWELL-NO-FMA: # BB#0: -; 
HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [1:0.50] ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; KNL-LABEL: v8f32_two_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] +; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] +; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] ; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; KNL-NEXT: retq # sched: 
[1:1.00] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50] +; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50] +; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v8f32_two_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] +; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50] ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 -; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 -; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 -; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50] +; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50] +; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -1074,23 +1074,23 @@ ; ; HASWELL-LABEL: v8f32_no_step: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_no_step: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] +; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; KNL-LABEL: v8f32_no_step: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] -; 
KNL-NEXT: retq # sched: [1:1.00] +; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v8f32_no_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm0 -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -1130,27 +1130,27 @@ ; ; HASWELL-LABEL: v8f32_no_step2: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] -; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] +; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_no_step2: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] -; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00] +; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] +; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; HASWELL-NO-FMA-NEXT: retq # sched: [2:1.00] ; ; KNL-LABEL: v8f32_no_step2: ; KNL: # BB#0: -; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] -; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; KNL-NEXT: retq # sched: [1:1.00] +; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] +; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; KNL-NEXT: retq # sched: [2:1.00] ; ; SKX-LABEL: v8f32_no_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm0 -; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; SKX-NEXT: retq # sched: [1:1.00] +; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [5:0.50] +; SKX-NEXT: retq # sched: [2:1.00] %div = fdiv fast <8 x float> , %x ret <8 x float> %div } Index: test/CodeGen/X86/sse-schedule.ll =================================================================== --- 
test/CodeGen/X86/sse-schedule.ll +++ test/CodeGen/X86/sse-schedule.ll @@ -37,8 +37,8 @@ ; HASWELL-LABEL: test_addps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addps: ; BTVER2: # BB#0: @@ -85,8 +85,8 @@ ; HASWELL-LABEL: test_addss: ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addss: ; BTVER2: # BB#0: @@ -137,8 +137,8 @@ ; HASWELL-LABEL: test_andps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andps: ; BTVER2: # BB#0: @@ -193,8 +193,8 @@ ; HASWELL-LABEL: test_andnotps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andnotps: ; BTVER2: # BB#0: @@ -251,9 +251,9 @@ ; HASWELL-LABEL: test_cmpps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00] -; HASWELL-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: 
[2:1.00] ; ; BTVER2-LABEL: test_cmpps: ; BTVER2: # BB#0: @@ -306,7 +306,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cmpss: ; BTVER2: # BB#0: @@ -399,7 +399,7 @@ ; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25] ; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25] ; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_comiss: ; BTVER2: # BB#0: @@ -470,7 +470,7 @@ ; HASWELL-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00] ; HASWELL-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsi2ss: ; BTVER2: # BB#0: @@ -523,10 +523,10 @@ ; ; HASWELL-LABEL: test_cvtsi2ssq: ; HASWELL: # BB#0: -; HASWELL-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00] ; HASWELL-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsi2ssq: ; BTVER2: # BB#0: @@ -580,9 +580,9 @@ ; HASWELL-LABEL: test_cvtss2si: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtss2si %xmm0, %ecx # sched: [4:1.00] -; HASWELL-NEXT: vcvtss2si (%rdi), %eax # sched: [8:1.00] +; HASWELL-NEXT: vcvtss2si (%rdi), %eax # sched: [4:1.00] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtss2si: ; BTVER2: # BB#0: @@ -639,9 +639,9 @@ ; HASWELL-LABEL: test_cvtss2siq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtss2si %xmm0, %rcx # sched: 
[4:1.00] -; HASWELL-NEXT: vcvtss2si (%rdi), %rax # sched: [8:1.00] +; HASWELL-NEXT: vcvtss2si (%rdi), %rax # sched: [4:1.00] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtss2siq: ; BTVER2: # BB#0: @@ -698,9 +698,9 @@ ; HASWELL-LABEL: test_cvttss2si: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvttss2si %xmm0, %ecx # sched: [4:1.00] -; HASWELL-NEXT: vcvttss2si (%rdi), %eax # sched: [8:1.00] +; HASWELL-NEXT: vcvttss2si (%rdi), %eax # sched: [4:1.00] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvttss2si: ; BTVER2: # BB#0: @@ -754,9 +754,9 @@ ; HASWELL-LABEL: test_cvttss2siq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvttss2si %xmm0, %rcx # sched: [4:1.00] -; HASWELL-NEXT: vcvttss2si (%rdi), %rax # sched: [8:1.00] +; HASWELL-NEXT: vcvttss2si (%rdi), %rax # sched: [4:1.00] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvttss2siq: ; BTVER2: # BB#0: @@ -805,9 +805,9 @@ ; ; HASWELL-LABEL: test_divps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00] -; HASWELL-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [13:1.00] +; HASWELL-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [13:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divps: ; BTVER2: # BB#0: @@ -853,9 +853,9 @@ ; ; HASWELL-LABEL: test_divss: ; HASWELL: # BB#0: -; HASWELL-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00] -; HASWELL-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [13:1.00] +; HASWELL-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [13:1.00] +; HASWELL-NEXT: 
retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divss: ; BTVER2: # BB#0: @@ -902,8 +902,8 @@ ; HASWELL-LABEL: test_ldmxcsr: ; HASWELL: # BB#0: ; HASWELL-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00] -; HASWELL-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [6:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [2:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_ldmxcsr: ; BTVER2: # BB#0: @@ -952,8 +952,8 @@ ; HASWELL-LABEL: test_maxps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxps: ; BTVER2: # BB#0: @@ -1001,8 +1001,8 @@ ; HASWELL-LABEL: test_maxss: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxss: ; BTVER2: # BB#0: @@ -1050,8 +1050,8 @@ ; HASWELL-LABEL: test_minps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_minps: ; BTVER2: # BB#0: @@ -1099,8 +1099,8 @@ ; HASWELL-LABEL: test_minss: ; HASWELL: # BB#0: ; HASWELL-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_minss: ; BTVER2: # BB#0: @@ 
-1151,10 +1151,10 @@ ; ; HASWELL-LABEL: test_movaps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovaps (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vmovaps (%rdi), %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movaps: ; BTVER2: # BB#0: @@ -1207,7 +1207,7 @@ ; HASWELL-LABEL: test_movhlps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movhlps: ; BTVER2: # BB#0: @@ -1257,10 +1257,10 @@ ; ; HASWELL-LABEL: test_movhps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movhps: ; BTVER2: # BB#0: @@ -1316,7 +1316,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] ; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movlhps: ; BTVER2: # BB#0: @@ -1365,10 +1365,10 @@ ; ; HASWELL-LABEL: test_movlps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00] +; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movlps: ; 
BTVER2: # BB#0: @@ -1419,7 +1419,7 @@ ; HASWELL-LABEL: test_movmskps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovmskps %xmm0, %eax # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movmskps: ; BTVER2: # BB#0: @@ -1465,7 +1465,7 @@ ; HASWELL-LABEL: test_movntps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movntps: ; BTVER2: # BB#0: @@ -1511,10 +1511,10 @@ ; ; HASWELL-LABEL: test_movss_mem: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50] ; HASWELL-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movss_mem: ; BTVER2: # BB#0: @@ -1565,7 +1565,7 @@ ; HASWELL-LABEL: test_movss_reg: ; HASWELL: # BB#0: ; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movss_reg: ; BTVER2: # BB#0: @@ -1611,10 +1611,10 @@ ; ; HASWELL-LABEL: test_movups: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vmovups (%rdi), %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movups: ; BTVER2: # BB#0: @@ -1663,8 +1663,8 @@ ; HASWELL-LABEL: test_mulps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmulps 
(%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulps: ; BTVER2: # BB#0: @@ -1711,8 +1711,8 @@ ; HASWELL-LABEL: test_mulss: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulss: ; BTVER2: # BB#0: @@ -1763,8 +1763,8 @@ ; HASWELL-LABEL: test_orps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_orps: ; BTVER2: # BB#0: @@ -1816,8 +1816,8 @@ ; ; HASWELL-LABEL: test_prefetchnta: ; HASWELL: # BB#0: -; HASWELL-NEXT: prefetchnta (%rdi) # sched: [4:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: prefetchnta (%rdi) # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_prefetchnta: ; BTVER2: # BB#0: @@ -1867,9 +1867,9 @@ ; HASWELL-LABEL: test_rcpps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vrcpps (%rdi), %xmm1 # sched: [9:1.00] +; HASWELL-NEXT: vrcpps (%rdi), %xmm1 # sched: [5:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rcpps: ; BTVER2: # BB#0: @@ -1929,11 +1929,11 @@ ; ; HASWELL-LABEL: test_rcpss: ; HASWELL: # BB#0: -; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00] -; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] -; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00] +; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vmovss 
{{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50] +; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [5:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rcpss: ; BTVER2: # BB#0: @@ -1994,9 +1994,9 @@ ; HASWELL-LABEL: test_rsqrtps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [9:1.00] +; HASWELL-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [5:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rsqrtps: ; BTVER2: # BB#0: @@ -2057,10 +2057,10 @@ ; HASWELL-LABEL: test_rsqrtss: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50] ; HASWELL-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_rsqrtss: ; BTVER2: # BB#0: @@ -2116,8 +2116,8 @@ ; ; HASWELL-LABEL: test_sfence: ; HASWELL: # BB#0: -; HASWELL-NEXT: sfence # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: sfence # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sfence: ; BTVER2: # BB#0: @@ -2165,8 +2165,8 @@ ; HASWELL-LABEL: test_shufps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00] -; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_shufps: ; BTVER2: # 
BB#0: @@ -2217,10 +2217,10 @@ ; ; HASWELL-LABEL: test_sqrtps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vsqrtps %xmm0, %xmm0 # sched: [15:1.00] -; HASWELL-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00] +; HASWELL-NEXT: vsqrtps %xmm0, %xmm0 # sched: [14:1.00] +; HASWELL-NEXT: vsqrtps (%rdi), %xmm1 # sched: [14:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtps: ; BTVER2: # BB#0: @@ -2280,11 +2280,11 @@ ; ; HASWELL-LABEL: test_sqrtss: ; HASWELL: # BB#0: -; HASWELL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00] -; HASWELL-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50] -; HASWELL-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00] +; HASWELL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [14:1.00] +; HASWELL-NEXT: vmovaps (%rdi), %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [14:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtss: ; BTVER2: # BB#0: @@ -2336,9 +2336,9 @@ ; ; HASWELL-LABEL: test_stmxcsr: ; HASWELL: # BB#0: -; HASWELL-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00] -; HASWELL-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00] +; HASWELL-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_stmxcsr: ; BTVER2: # BB#0: @@ -2387,8 +2387,8 @@ ; HASWELL-LABEL: test_subps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subps: ; BTVER2: # BB#0: @@ -2435,8 +2435,8 @@ 
; HASWELL-LABEL: test_subss: ; HASWELL: # BB#0: ; HASWELL-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subss: ; BTVER2: # BB#0: @@ -2524,7 +2524,7 @@ ; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25] ; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25] ; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_ucomiss: ; BTVER2: # BB#0: @@ -2593,8 +2593,8 @@ ; HASWELL-LABEL: test_unpckhps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpckhps: ; BTVER2: # BB#0: @@ -2645,8 +2645,8 @@ ; HASWELL-LABEL: test_unpcklps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpcklps: ; BTVER2: # BB#0: @@ -2697,8 +2697,8 @@ ; HASWELL-LABEL: test_xorps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_xorps: ; BTVER2: # BB#0: Index: 
test/CodeGen/X86/sse2-schedule.ll =================================================================== --- test/CodeGen/X86/sse2-schedule.ll +++ test/CodeGen/X86/sse2-schedule.ll @@ -37,8 +37,8 @@ ; HASWELL-LABEL: test_addpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addpd: ; BTVER2: # BB#0: @@ -85,8 +85,8 @@ ; HASWELL-LABEL: test_addsd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addsd: ; BTVER2: # BB#0: @@ -137,9 +137,9 @@ ; HASWELL-LABEL: test_andpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andpd: ; BTVER2: # BB#0: @@ -197,9 +197,9 @@ ; HASWELL-LABEL: test_andnotpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_andnotpd: ; BTVER2: # BB#0: @@ -259,9 +259,9 @@ ; HASWELL-LABEL: test_cmppd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] -; HASWELL-NEXT: vcmpeqpd 
(%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cmppd: ; BTVER2: # BB#0: @@ -314,7 +314,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cmpsd: ; BTVER2: # BB#0: @@ -407,7 +407,7 @@ ; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25] ; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25] ; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_comisd: ; BTVER2: # BB#0: @@ -476,9 +476,9 @@ ; HASWELL-LABEL: test_cvtdq2pd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] -; HASWELL-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [4:1.00] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtdq2pd: ; BTVER2: # BB#0: @@ -534,10 +534,10 @@ ; ; HASWELL-LABEL: test_cvtdq2ps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00] -; HASWELL-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [3:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtdq2ps: ; BTVER2: # BB#0: @@ -592,9 +592,9 @@ ; HASWELL-LABEL: test_cvtpd2dq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00] -; HASWELL-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: 
[8:1.00] +; HASWELL-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtpd2dq: ; BTVER2: # BB#0: @@ -650,9 +650,9 @@ ; HASWELL-LABEL: test_cvtpd2ps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00] -; HASWELL-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtpd2ps: ; BTVER2: # BB#0: @@ -708,9 +708,9 @@ ; HASWELL-LABEL: test_cvtps2dq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00] +; HASWELL-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [3:1.00] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtps2dq: ; BTVER2: # BB#0: @@ -766,9 +766,9 @@ ; HASWELL-LABEL: test_cvtps2pd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00] -; HASWELL-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [1:1.00] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtps2pd: ; BTVER2: # BB#0: @@ -824,9 +824,9 @@ ; HASWELL-LABEL: test_cvtsd2si: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtsd2si %xmm0, %ecx # sched: [4:1.00] -; HASWELL-NEXT: vcvtsd2si (%rdi), %eax # sched: [8:1.00] +; HASWELL-NEXT: vcvtsd2si (%rdi), %eax # sched: [4:1.00] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsd2si: ; BTVER2: # 
BB#0: @@ -883,9 +883,9 @@ ; HASWELL-LABEL: test_cvtsd2siq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtsd2si %xmm0, %rcx # sched: [4:1.00] -; HASWELL-NEXT: vcvtsd2si (%rdi), %rax # sched: [8:1.00] +; HASWELL-NEXT: vcvtsd2si (%rdi), %rax # sched: [4:1.00] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsd2siq: ; BTVER2: # BB#0: @@ -947,10 +947,10 @@ ; HASWELL-LABEL: test_cvtsd2ss: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00] -; HASWELL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [1:0.50] ; HASWELL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00] ; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsd2ss: ; BTVER2: # BB#0: @@ -1008,7 +1008,7 @@ ; HASWELL-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00] ; HASWELL-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsi2sd: ; BTVER2: # BB#0: @@ -1064,7 +1064,7 @@ ; HASWELL-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00] ; HASWELL-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] ; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtsi2sdq: ; BTVER2: # BB#0: @@ -1125,10 +1125,10 @@ ; HASWELL-LABEL: test_cvtss2sd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [2:1.00] -; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50] ; HASWELL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: 
[2:1.00] ; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvtss2sd: ; BTVER2: # BB#0: @@ -1185,9 +1185,9 @@ ; HASWELL-LABEL: test_cvttpd2dq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00] -; HASWELL-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvttpd2dq: ; BTVER2: # BB#0: @@ -1244,9 +1244,9 @@ ; HASWELL-LABEL: test_cvttps2dq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00] +; HASWELL-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [3:1.00] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvttps2dq: ; BTVER2: # BB#0: @@ -1300,9 +1300,9 @@ ; HASWELL-LABEL: test_cvttsd2si: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvttsd2si %xmm0, %ecx # sched: [4:1.00] -; HASWELL-NEXT: vcvttsd2si (%rdi), %eax # sched: [8:1.00] +; HASWELL-NEXT: vcvttsd2si (%rdi), %eax # sched: [4:1.00] ; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvttsd2si: ; BTVER2: # BB#0: @@ -1356,9 +1356,9 @@ ; HASWELL-LABEL: test_cvttsd2siq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vcvttsd2si %xmm0, %rcx # sched: [4:1.00] -; HASWELL-NEXT: vcvttsd2si (%rdi), %rax # sched: [8:1.00] +; HASWELL-NEXT: vcvttsd2si (%rdi), %rax # sched: [4:1.00] ; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_cvttsd2siq: ; BTVER2: # BB#0: @@ -1407,9 +1407,9 @@ ; ; HASWELL-LABEL: 
test_divpd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00] -; HASWELL-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [20:1.00] +; HASWELL-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [20:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divpd: ; BTVER2: # BB#0: @@ -1455,9 +1455,9 @@ ; ; HASWELL-LABEL: test_divsd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00] -; HASWELL-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [20:1.00] +; HASWELL-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [20:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_divsd: ; BTVER2: # BB#0: @@ -1505,8 +1505,8 @@ ; ; HASWELL-LABEL: test_lfence: ; HASWELL: # BB#0: -; HASWELL-NEXT: lfence # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: lfence # sched: [2:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lfence: ; BTVER2: # BB#0: @@ -1551,8 +1551,8 @@ ; ; HASWELL-LABEL: test_mfence: ; HASWELL: # BB#0: -; HASWELL-NEXT: mfence # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: mfence # sched: [2:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mfence: ; BTVER2: # BB#0: @@ -1595,8 +1595,8 @@ ; ; HASWELL-LABEL: test_maskmovdqu: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [14:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maskmovdqu: ; BTVER2: # BB#0: @@ -1640,8 +1640,8 @@ ; HASWELL-LABEL: test_maxpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: 
[1:1.00] +; HASWELL-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxpd: ; BTVER2: # BB#0: @@ -1689,8 +1689,8 @@ ; HASWELL-LABEL: test_maxsd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_maxsd: ; BTVER2: # BB#0: @@ -1738,8 +1738,8 @@ ; HASWELL-LABEL: test_minpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_minpd: ; BTVER2: # BB#0: @@ -1787,8 +1787,8 @@ ; HASWELL-LABEL: test_minsd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_minsd: ; BTVER2: # BB#0: @@ -1839,10 +1839,10 @@ ; ; HASWELL-LABEL: test_movapd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vmovapd (%rdi), %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movapd: ; BTVER2: # BB#0: @@ -1894,10 +1894,10 @@ ; ; HASWELL-LABEL: test_movdqa: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vmovdqa (%rdi), %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] 
; HASWELL-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movdqa: ; BTVER2: # BB#0: @@ -1949,10 +1949,10 @@ ; ; HASWELL-LABEL: test_movdqu: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vmovdqu (%rdi), %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movdqu: ; BTVER2: # BB#0: @@ -2017,12 +2017,12 @@ ; HASWELL-LABEL: test_movd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovd %edi, %xmm1 # sched: [1:1.00] -; HASWELL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] ; HASWELL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vmovd %xmm0, %eax # sched: [1:1.00] ; HASWELL-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movd: ; BTVER2: # BB#0: @@ -2098,12 +2098,12 @@ ; HASWELL-LABEL: test_movd_64: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00] -; HASWELL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [1:0.50] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50] ; HASWELL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vmovq %xmm0, %rax # sched: [1:1.00] ; HASWELL-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movd_64: ; BTVER2: # BB#0: @@ -2166,10 +2166,10 @@ ; ; HASWELL-LABEL: test_movhpd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = 
xmm1[0],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movhpd: ; BTVER2: # BB#0: @@ -2224,10 +2224,10 @@ ; ; HASWELL-LABEL: test_movlpd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00] +; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movlpd: ; BTVER2: # BB#0: @@ -2277,7 +2277,7 @@ ; HASWELL-LABEL: test_movmskpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovmskpd %xmm0, %eax # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movmskpd: ; BTVER2: # BB#0: @@ -2324,7 +2324,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movntdqa: ; BTVER2: # BB#0: @@ -2371,7 +2371,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movntpd: ; BTVER2: # BB#0: @@ -2420,10 +2420,10 @@ ; ; HASWELL-LABEL: test_movq_mem: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [1:0.50] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] -; HASWELL-NEXT: 
retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movq_mem: ; BTVER2: # BB#0: @@ -2477,7 +2477,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33] ; HASWELL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movq_reg: ; BTVER2: # BB#0: @@ -2526,10 +2526,10 @@ ; ; HASWELL-LABEL: test_movsd_mem: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [1:0.50] ; HASWELL-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movsd_mem: ; BTVER2: # BB#0: @@ -2581,7 +2581,7 @@ ; HASWELL-LABEL: test_movsd_reg: ; HASWELL: # BB#0: ; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movsd_reg: ; BTVER2: # BB#0: @@ -2627,10 +2627,10 @@ ; ; HASWELL-LABEL: test_movupd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vmovupd (%rdi), %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movupd: ; BTVER2: # BB#0: @@ -2679,8 +2679,8 @@ ; HASWELL-LABEL: test_mulpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulpd: ; BTVER2: # BB#0: @@ -2727,8 +2727,8 @@ ; 
HASWELL-LABEL: test_mulsd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mulsd: ; BTVER2: # BB#0: @@ -2779,9 +2779,9 @@ ; HASWELL-LABEL: test_orpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_orpd: ; BTVER2: # BB#0: @@ -2839,8 +2839,8 @@ ; HASWELL-LABEL: test_packssdw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_packssdw: ; BTVER2: # BB#0: @@ -2893,8 +2893,8 @@ ; HASWELL-LABEL: test_packsswb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_packsswb: ; BTVER2: # BB#0: @@ -2947,8 +2947,8 @@ ; HASWELL-LABEL: test_packuswb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_packuswb: ; BTVER2: # 
BB#0: @@ -3001,8 +3001,8 @@ ; HASWELL-LABEL: test_paddb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_paddb: ; BTVER2: # BB#0: @@ -3053,8 +3053,8 @@ ; HASWELL-LABEL: test_paddd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_paddd: ; BTVER2: # BB#0: @@ -3101,8 +3101,8 @@ ; HASWELL-LABEL: test_paddq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_paddq: ; BTVER2: # BB#0: @@ -3153,8 +3153,8 @@ ; HASWELL-LABEL: test_paddsb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_paddsb: ; BTVER2: # BB#0: @@ -3206,8 +3206,8 @@ ; HASWELL-LABEL: test_paddsw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_paddsw: ; BTVER2: # BB#0: @@ -3259,8 +3259,8 @@ ; HASWELL-LABEL: test_paddusb: ; 
HASWELL: # BB#0: ; HASWELL-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_paddusb: ; BTVER2: # BB#0: @@ -3312,8 +3312,8 @@ ; HASWELL-LABEL: test_paddusw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_paddusw: ; BTVER2: # BB#0: @@ -3365,8 +3365,8 @@ ; HASWELL-LABEL: test_paddw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_paddw: ; BTVER2: # BB#0: @@ -3417,9 +3417,9 @@ ; HASWELL-LABEL: test_pand: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pand: ; BTVER2: # BB#0: @@ -3479,9 +3479,9 @@ ; HASWELL-LABEL: test_pandn: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50] +; HASWELL-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [1:0.50] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pandn: ; BTVER2: 
# BB#0: @@ -3537,8 +3537,8 @@ ; HASWELL-LABEL: test_pavgb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pavgb: ; BTVER2: # BB#0: @@ -3590,8 +3590,8 @@ ; HASWELL-LABEL: test_pavgw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pavgw: ; BTVER2: # BB#0: @@ -3645,9 +3645,9 @@ ; HASWELL-LABEL: test_pcmpeqb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; HASWELL-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpeqb: ; BTVER2: # BB#0: @@ -3704,9 +3704,9 @@ ; HASWELL-LABEL: test_pcmpeqd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpeqd: ; BTVER2: # BB#0: @@ -3763,9 +3763,9 @@ ; HASWELL-LABEL: test_pcmpeqw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; HASWELL-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # 
sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpeqw: ; BTVER2: # BB#0: @@ -3823,9 +3823,9 @@ ; HASWELL-LABEL: test_pcmpgtb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; HASWELL-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpgtb: ; BTVER2: # BB#0: @@ -3883,9 +3883,9 @@ ; HASWELL-LABEL: test_pcmpgtd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpgtd: ; BTVER2: # BB#0: @@ -3943,9 +3943,9 @@ ; HASWELL-LABEL: test_pcmpgtw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; HASWELL-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpgtw: ; BTVER2: # BB#0: @@ -3995,9 +3995,9 @@ ; ; HASWELL-LABEL: test_pextrw: ; HASWELL: # BB#0: -; HASWELL-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:1.00] +; HASWELL-NEXT: vpextrw $6, %xmm0, %eax # sched: [2:1.00] ; HASWELL-NEXT: # kill: %AX %AX %EAX -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pextrw: ; BTVER2: # BB#0: @@ -4045,9 +4045,9 @@ ; ; HASWELL-LABEL: test_pinsrw: ; HASWELL: # BB#0: -; HASWELL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: 
[1:1.00] -; HASWELL-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:2.00] +; HASWELL-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pinsrw: ; BTVER2: # BB#0: @@ -4102,8 +4102,8 @@ ; HASWELL-LABEL: test_pmaddwd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmaddwd: ; BTVER2: # BB#0: @@ -4156,8 +4156,8 @@ ; HASWELL-LABEL: test_pmaxsw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmaxsw: ; BTVER2: # BB#0: @@ -4209,8 +4209,8 @@ ; HASWELL-LABEL: test_pmaxub: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmaxub: ; BTVER2: # BB#0: @@ -4262,8 +4262,8 @@ ; HASWELL-LABEL: test_pminsw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pminsw: ; BTVER2: # BB#0: @@ -4315,8 +4315,8 @@ ; HASWELL-LABEL: test_pminub: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpminub 
%xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pminub: ; BTVER2: # BB#0: @@ -4362,7 +4362,7 @@ ; HASWELL-LABEL: test_pmovmskb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovmskb %xmm0, %eax # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovmskb: ; BTVER2: # BB#0: @@ -4406,8 +4406,8 @@ ; HASWELL-LABEL: test_pmulhuw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmulhuw: ; BTVER2: # BB#0: @@ -4455,8 +4455,8 @@ ; HASWELL-LABEL: test_pmulhw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmulhw: ; BTVER2: # BB#0: @@ -4504,8 +4504,8 @@ ; HASWELL-LABEL: test_pmullw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmullw: ; BTVER2: # BB#0: @@ -4560,8 +4560,8 @@ ; HASWELL-LABEL: test_pmuludq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmuludq 
(%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmuludq: ; BTVER2: # BB#0: @@ -4614,9 +4614,9 @@ ; HASWELL-LABEL: test_por: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_por: ; BTVER2: # BB#0: @@ -4674,8 +4674,8 @@ ; HASWELL-LABEL: test_psadbw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psadbw: ; BTVER2: # BB#0: @@ -4730,9 +4730,9 @@ ; HASWELL-LABEL: test_pshufd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00] -; HASWELL-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00] +; HASWELL-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pshufd: ; BTVER2: # BB#0: @@ -4788,9 +4788,9 @@ ; HASWELL-LABEL: test_pshufhw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00] -; HASWELL-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:1.00] +; HASWELL-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [1:1.00] ; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pshufhw: ; BTVER2: # BB#0: @@ -4846,9 +4846,9 @@ ; HASWELL-LABEL: test_pshuflw: ; HASWELL: # BB#0: ; 
HASWELL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00] -; HASWELL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:1.00] +; HASWELL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [1:1.00] ; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pshuflw: ; BTVER2: # BB#0: @@ -4902,9 +4902,9 @@ ; HASWELL-LABEL: test_pslld: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; HASWELL-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [2:1.00] ; HASWELL-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pslld: ; BTVER2: # BB#0: @@ -4958,7 +4958,7 @@ ; HASWELL-LABEL: test_pslldq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pslldq: ; BTVER2: # BB#0: @@ -5005,9 +5005,9 @@ ; HASWELL-LABEL: test_psllq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; HASWELL-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [2:1.00] ; HASWELL-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psllq: ; BTVER2: # BB#0: @@ -5063,9 +5063,9 @@ ; HASWELL-LABEL: test_psllw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; HASWELL-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [2:1.00] ; HASWELL-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # 
sched: [2:1.00] ; ; BTVER2-LABEL: test_psllw: ; BTVER2: # BB#0: @@ -5121,9 +5121,9 @@ ; HASWELL-LABEL: test_psrad: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; HASWELL-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [2:1.00] ; HASWELL-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psrad: ; BTVER2: # BB#0: @@ -5179,9 +5179,9 @@ ; HASWELL-LABEL: test_psraw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; HASWELL-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [2:1.00] ; HASWELL-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psraw: ; BTVER2: # BB#0: @@ -5237,9 +5237,9 @@ ; HASWELL-LABEL: test_psrld: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; HASWELL-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [2:1.00] ; HASWELL-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psrld: ; BTVER2: # BB#0: @@ -5293,7 +5293,7 @@ ; HASWELL-LABEL: test_psrldq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psrldq: ; BTVER2: # BB#0: @@ -5340,9 +5340,9 @@ ; HASWELL-LABEL: test_psrlq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; HASWELL-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [2:1.00] ; HASWELL-NEXT: vpsrlq $2, %xmm0, 
%xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psrlq: ; BTVER2: # BB#0: @@ -5398,9 +5398,9 @@ ; HASWELL-LABEL: test_psrlw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; HASWELL-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [2:1.00] ; HASWELL-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psrlw: ; BTVER2: # BB#0: @@ -5456,8 +5456,8 @@ ; HASWELL-LABEL: test_psubb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psubb: ; BTVER2: # BB#0: @@ -5508,8 +5508,8 @@ ; HASWELL-LABEL: test_psubd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psubd: ; BTVER2: # BB#0: @@ -5556,8 +5556,8 @@ ; HASWELL-LABEL: test_psubq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psubq: ; BTVER2: # BB#0: @@ -5608,8 +5608,8 @@ ; HASWELL-LABEL: test_psubsb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; 
HASWELL-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psubsb: ; BTVER2: # BB#0: @@ -5661,8 +5661,8 @@ ; HASWELL-LABEL: test_psubsw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psubsw: ; BTVER2: # BB#0: @@ -5714,8 +5714,8 @@ ; HASWELL-LABEL: test_psubusb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psubusb: ; BTVER2: # BB#0: @@ -5767,8 +5767,8 @@ ; HASWELL-LABEL: test_psubusw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psubusw: ; BTVER2: # BB#0: @@ -5820,8 +5820,8 @@ ; HASWELL-LABEL: test_psubw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psubw: ; BTVER2: # BB#0: @@ -5872,8 +5872,8 @@ ; HASWELL-LABEL: test_punpckhbw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00] -; 
HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_punpckhbw: ; BTVER2: # BB#0: @@ -5926,9 +5926,9 @@ ; HASWELL-LABEL: test_punpckhdq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:1.00] +; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [1:1.00] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_punpckhdq: ; BTVER2: # BB#0: @@ -5982,9 +5982,9 @@ ; HASWELL-LABEL: test_punpckhqdq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] -; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00] +; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_punpckhqdq: ; BTVER2: # BB#0: @@ -6038,8 +6038,8 @@ ; HASWELL-LABEL: test_punpckhwd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00] -; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_punpckhwd: ; BTVER2: # BB#0: @@ -6090,8 +6090,8 @@ ; HASWELL-LABEL: test_punpcklbw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00] -; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_punpcklbw: ; BTVER2: # BB#0: @@ -6144,9 +6144,9 @@ ; HASWELL-LABEL: test_punpckldq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:1.00] +; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [1:1.00] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_punpckldq: ; BTVER2: # BB#0: @@ -6200,9 +6200,9 @@ ; HASWELL-LABEL: test_punpcklqdq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_punpcklqdq: ; BTVER2: # BB#0: @@ -6256,8 +6256,8 @@ ; HASWELL-LABEL: 
test_punpcklwd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_punpcklwd: ; BTVER2: # BB#0: @@ -6308,9 +6308,9 @@ ; HASWELL-LABEL: test_pxor: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [1:0.50] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pxor: ; BTVER2: # BB#0: @@ -6364,9 +6364,9 @@ ; HASWELL-LABEL: test_shufpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00] -; HASWELL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [1:1.00] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_shufpd: ; BTVER2: # BB#0: @@ -6420,10 +6420,10 @@ ; ; HASWELL-LABEL: test_sqrtpd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00] -; HASWELL-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00] +; HASWELL-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:1.00] +; HASWELL-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [21:1.00] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtpd: ; BTVER2: # BB#0: @@ -6483,11 +6483,11 @@ ; ; HASWELL-LABEL: test_sqrtsd: 
; HASWELL: # BB#0: -; HASWELL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00] -; HASWELL-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50] -; HASWELL-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00] +; HASWELL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00] +; HASWELL-NEXT: vmovapd (%rdi), %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_sqrtsd: ; BTVER2: # BB#0: @@ -6540,8 +6540,8 @@ ; HASWELL-LABEL: test_subpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subpd: ; BTVER2: # BB#0: @@ -6588,8 +6588,8 @@ ; HASWELL-LABEL: test_subsd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_subsd: ; BTVER2: # BB#0: @@ -6677,7 +6677,7 @@ ; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25] ; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25] ; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_ucomisd: ; BTVER2: # BB#0: @@ -6746,9 +6746,9 @@ ; HASWELL-LABEL: test_unpckhpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] -; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00] +; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: 
[3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpckhpd: ; BTVER2: # BB#0: @@ -6808,9 +6808,9 @@ ; HASWELL-LABEL: test_unpcklpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [1:1.00] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_unpcklpd: ; BTVER2: # BB#0: @@ -6864,9 +6864,9 @@ ; HASWELL-LABEL: test_xorpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00] ; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_xorpd: ; BTVER2: # BB#0: Index: test/CodeGen/X86/sse3-schedule.ll =================================================================== --- test/CodeGen/X86/sse3-schedule.ll +++ test/CodeGen/X86/sse3-schedule.ll @@ -37,8 +37,8 @@ ; HASWELL-LABEL: test_addsubpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_addsubpd: ; BTVER2: # BB#0: @@ -86,8 +86,8 @@ ; HASWELL-LABEL: test_addsubps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] 
; ; BTVER2-LABEL: test_addsubps: ; BTVER2: # BB#0: @@ -135,8 +135,8 @@ ; HASWELL-LABEL: test_haddpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00] -; HASWELL-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [5:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_haddpd: ; BTVER2: # BB#0: @@ -184,8 +184,8 @@ ; HASWELL-LABEL: test_haddps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00] -; HASWELL-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [9:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [5:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_haddps: ; BTVER2: # BB#0: @@ -233,8 +233,8 @@ ; HASWELL-LABEL: test_hsubpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00] -; HASWELL-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [5:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_hsubpd: ; BTVER2: # BB#0: @@ -282,8 +282,8 @@ ; HASWELL-LABEL: test_hsubps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00] -; HASWELL-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [9:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [5:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_hsubps: ; BTVER2: # BB#0: @@ -328,8 +328,8 @@ ; ; HASWELL-LABEL: test_lddqu: ; HASWELL: # BB#0: -; HASWELL-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vlddqu (%rdi), %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_lddqu: ; BTVER2: # BB#0: @@ -379,7 +379,7 @@ ; HASWELL-NEXT: leaq (%rdi), %rax # sched: [1:0.50] ; 
HASWELL-NEXT: movl %esi, %ecx # sched: [1:0.25] ; HASWELL-NEXT: monitor # sched: [100:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_monitor: ; BTVER2: # BB#0: @@ -432,9 +432,9 @@ ; HASWELL-LABEL: test_movddup: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00] -; HASWELL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50] +; HASWELL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [1:0.50] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movddup: ; BTVER2: # BB#0: @@ -489,9 +489,9 @@ ; HASWELL-LABEL: test_movshdup: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00] -; HASWELL-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50] +; HASWELL-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [1:0.50] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movshdup: ; BTVER2: # BB#0: @@ -546,9 +546,9 @@ ; HASWELL-LABEL: test_movsldup: ; HASWELL: # BB#0: ; HASWELL-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00] -; HASWELL-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50] +; HASWELL-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [1:0.50] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movsldup: ; BTVER2: # BB#0: @@ -603,8 +603,8 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: movl %edi, %ecx # sched: [1:0.25] ; HASWELL-NEXT: movl %esi, %eax # sched: [1:0.25] -; HASWELL-NEXT: mwait # sched: [100:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: mwait # sched: [20:2.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mwait: ; BTVER2: # BB#0: 
Index: test/CodeGen/X86/sse41-schedule.ll =================================================================== --- test/CodeGen/X86/sse41-schedule.ll +++ test/CodeGen/X86/sse41-schedule.ll @@ -34,8 +34,8 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.33] ; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendpd: ; BTVER2: # BB#0: @@ -79,8 +79,8 @@ ; HASWELL-LABEL: test_blendps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33] -; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendps: ; BTVER2: # BB#0: @@ -127,8 +127,8 @@ ; HASWELL-LABEL: test_blendvpd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00] -; HASWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendvpd: ; BTVER2: # BB#0: @@ -176,8 +176,8 @@ ; HASWELL-LABEL: test_blendvps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00] -; HASWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_blendvps: ; BTVER2: # BB#0: @@ -219,8 +219,8 @@ ; HASWELL-LABEL: test_dppd: ; HASWELL: # BB#0: ; 
HASWELL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00] -; HASWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_dppd: ; BTVER2: # BB#0: @@ -262,8 +262,8 @@ ; HASWELL-LABEL: test_dpps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [14:2.00] -; HASWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [18:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [14:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_dpps: ; BTVER2: # BB#0: @@ -305,8 +305,8 @@ ; HASWELL-LABEL: test_insertps: ; HASWELL: # BB#0: ; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00] -; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_insertps: ; BTVER2: # BB#0: @@ -344,8 +344,8 @@ ; ; HASWELL-LABEL: test_movntdqa: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_movntdqa: ; BTVER2: # BB#0: @@ -382,9 +382,9 @@ ; ; HASWELL-LABEL: test_mpsadbw: ; HASWELL: # BB#0: -; HASWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [7:2.00] +; HASWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [7:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_mpsadbw: ; BTVER2: # BB#0: @@ -427,8 +427,8 @@ ; HASWELL-LABEL: 
test_packusdw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_packusdw: ; BTVER2: # BB#0: @@ -477,8 +477,8 @@ ; HASWELL-LABEL: test_pblendvb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00] -; HASWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pblendvb: ; BTVER2: # BB#0: @@ -521,7 +521,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00] ; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pblendw: ; BTVER2: # BB#0: @@ -562,8 +562,8 @@ ; HASWELL-LABEL: test_pcmpeqq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpeqq: ; BTVER2: # BB#0: @@ -605,9 +605,9 @@ ; ; HASWELL-LABEL: test_pextrb: ; HASWELL: # BB#0: -; HASWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:1.00] -; HASWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [2:1.00] +; HASWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pextrb: ; BTVER2: # BB#0: 
@@ -648,9 +648,9 @@ ; ; HASWELL-LABEL: test_pextrd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:1.00] -; HASWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [2:1.00] +; HASWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pextrd: ; BTVER2: # BB#0: @@ -690,9 +690,9 @@ ; ; HASWELL-LABEL: test_pextrq: ; HASWELL: # BB#0: -; HASWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:1.00] -; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [2:1.00] +; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pextrq: ; BTVER2: # BB#0: @@ -732,9 +732,9 @@ ; ; HASWELL-LABEL: test_pextrw: ; HASWELL: # BB#0: -; HASWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:1.00] -; HASWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [2:1.00] +; HASWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pextrw: ; BTVER2: # BB#0: @@ -775,9 +775,9 @@ ; ; HASWELL-LABEL: test_phminposuw: ; HASWELL: # BB#0: -; HASWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [5:1.00] ; HASWELL-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_phminposuw: ; BTVER2: # BB#0: @@ -818,9 +818,9 @@ ; ; HASWELL-LABEL: test_pinsrb: ; HASWELL: # BB#0: -; HASWELL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpinsrb $1, %edi, %xmm0, 
%xmm0 # sched: [2:2.00] +; HASWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pinsrb: ; BTVER2: # BB#0: @@ -860,9 +860,9 @@ ; ; HASWELL-LABEL: test_pinsrd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:2.00] +; HASWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pinsrd: ; BTVER2: # BB#0: @@ -905,10 +905,10 @@ ; ; HASWELL-LABEL: test_pinsrq: ; HASWELL: # BB#0: -; HASWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:2.00] +; HASWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [1:1.00] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pinsrq: ; BTVER2: # BB#0: @@ -952,8 +952,8 @@ ; HASWELL-LABEL: test_pmaxsb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmaxsb: ; BTVER2: # BB#0: @@ -995,8 +995,8 @@ ; HASWELL-LABEL: test_pmaxsd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmaxsd: ; BTVER2: # BB#0: @@ -1038,8 +1038,8 @@ ; HASWELL-LABEL: 
test_pmaxud: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmaxud: ; BTVER2: # BB#0: @@ -1081,8 +1081,8 @@ ; HASWELL-LABEL: test_pmaxuw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmaxuw: ; BTVER2: # BB#0: @@ -1124,8 +1124,8 @@ ; HASWELL-LABEL: test_pminsb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pminsb: ; BTVER2: # BB#0: @@ -1167,8 +1167,8 @@ ; HASWELL-LABEL: test_pminsd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pminsd: ; BTVER2: # BB#0: @@ -1210,8 +1210,8 @@ ; HASWELL-LABEL: test_pminud: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pminud: ; BTVER2: # BB#0: @@ -1253,8 +1253,8 @@ ; HASWELL-LABEL: test_pminuw: ; HASWELL: # BB#0: ; HASWELL-NEXT: 
vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pminuw: ; BTVER2: # BB#0: @@ -1300,9 +1300,9 @@ ; HASWELL-LABEL: test_pmovsxbw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [1:1.00] ; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxbw: ; BTVER2: # BB#0: @@ -1351,9 +1351,9 @@ ; HASWELL-LABEL: test_pmovsxbd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [1:1.00] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxbd: ; BTVER2: # BB#0: @@ -1402,9 +1402,9 @@ ; HASWELL-LABEL: test_pmovsxbq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [1:1.00] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxbq: ; BTVER2: # BB#0: @@ -1453,9 +1453,9 @@ ; HASWELL-LABEL: test_pmovsxdq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [1:1.00] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # 
sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxdq: ; BTVER2: # BB#0: @@ -1504,9 +1504,9 @@ ; HASWELL-LABEL: test_pmovsxwd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [1:1.00] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxwd: ; BTVER2: # BB#0: @@ -1555,9 +1555,9 @@ ; HASWELL-LABEL: test_pmovsxwq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [1:1.00] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovsxwq: ; BTVER2: # BB#0: @@ -1606,9 +1606,9 @@ ; HASWELL-LABEL: test_pmovzxbw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] -; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00] +; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [1:1.00] ; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxbw: ; BTVER2: # BB#0: @@ -1657,9 +1657,9 @@ ; HASWELL-LABEL: test_pmovzxbd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00] -; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00] +; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [1:1.00] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxbd: ; BTVER2: # BB#0: @@ -1708,9 +1708,9 @@ ; HASWELL-LABEL: test_pmovzxbq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00] -; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00] +; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxbq: ; BTVER2: # BB#0: @@ -1759,9 +1759,9 @@ ; HASWELL-LABEL: test_pmovzxdq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00] -; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:1.00] +; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [1:1.00] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxdq: ; BTVER2: # BB#0: @@ -1810,9 +1810,9 @@ ; HASWELL-LABEL: test_pmovzxwd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00] -; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00] +; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [1:1.00] ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxwd: ; BTVER2: # BB#0: @@ -1861,9 +1861,9 @@ ; HASWELL-LABEL: test_pmovzxwq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00] -; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00] +; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [1:1.00] ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmovzxwq: ; BTVER2: # BB#0: @@ -1908,8 +1908,8 @@ ; HASWELL-LABEL: test_pmuldq: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmuldq: ; BTVER2: # BB#0: @@ -1953,7 +1953,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [10:2.00] ; HASWELL-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [10:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmulld: ; BTVER2: # BB#0: @@ -2011,7 +2011,7 @@ ; HASWELL-NEXT: setb %cl # sched: [1:0.50] ; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25] ; HASWELL-NEXT: movzbl %cl, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_ptest: ; BTVER2: # BB#0: @@ -2065,10 +2065,10 @@ ; ; HASWELL-LABEL: test_roundpd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: 
[10:2.00] +; HASWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [5:1.25] +; HASWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [6:2.00] ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundpd: ; BTVER2: # BB#0: @@ -2116,10 +2116,10 @@ ; ; HASWELL-LABEL: test_roundps: ; HASWELL: # BB#0: -; HASWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:2.00] +; HASWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [5:1.25] +; HASWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [6:2.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundps: ; BTVER2: # BB#0: @@ -2168,10 +2168,10 @@ ; ; HASWELL-LABEL: test_roundsd: ; HASWELL: # BB#0: -; HASWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00] -; HASWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00] +; HASWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [5:1.25] +; HASWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] ; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundsd: ; BTVER2: # BB#0: @@ -2220,10 +2220,10 @@ ; ; HASWELL-LABEL: test_roundss: ; HASWELL: # BB#0: -; HASWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00] -; HASWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00] +; HASWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [5:1.25] +; HASWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00] ; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_roundss: ; BTVER2: # BB#0: Index: test/CodeGen/X86/sse42-schedule.ll 
=================================================================== --- test/CodeGen/X86/sse42-schedule.ll +++ test/CodeGen/X86/sse42-schedule.ll @@ -35,7 +35,7 @@ ; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00] ; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00] ; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: crc32_32_8: ; BTVER2: # BB#0: @@ -84,7 +84,7 @@ ; HASWELL-NEXT: crc32w %si, %edi # sched: [3:1.00] ; HASWELL-NEXT: crc32w (%rdx), %edi # sched: [7:1.00] ; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: crc32_32_16: ; BTVER2: # BB#0: @@ -133,7 +133,7 @@ ; HASWELL-NEXT: crc32l %esi, %edi # sched: [3:1.00] ; HASWELL-NEXT: crc32l (%rdx), %edi # sched: [7:1.00] ; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: crc32_32_32: ; BTVER2: # BB#0: @@ -182,7 +182,7 @@ ; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00] ; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00] ; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: crc32_64_8: ; BTVER2: # BB#0: @@ -231,7 +231,7 @@ ; HASWELL-NEXT: crc32q %rsi, %rdi # sched: [3:1.00] ; HASWELL-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00] ; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: crc32_64_64: ; BTVER2: # BB#0: @@ -297,14 +297,14 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25] ; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25] -; HASWELL-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [11:3.00] +; HASWELL-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [18:4.00] ; HASWELL-NEXT: movl %ecx, %esi # sched: [1:0.25] ; HASWELL-NEXT: movl 
$7, %eax # sched: [1:0.25] ; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25] -; HASWELL-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [11:3.00] +; HASWELL-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [18:4.00] ; HASWELL-NEXT: # kill: %ECX %ECX %RCX ; HASWELL-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpestri: ; BTVER2: # BB#0: @@ -374,11 +374,11 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25] ; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25] -; HASWELL-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [10:4.00] +; HASWELL-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [19:4.00] ; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25] ; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25] -; HASWELL-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [10:3.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [19:4.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpestrm: ; BTVER2: # BB#0: @@ -441,7 +441,7 @@ ; HASWELL-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [11:3.00] ; HASWELL-NEXT: # kill: %ECX %ECX %RCX ; HASWELL-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpistri: ; BTVER2: # BB#0: @@ -489,9 +489,9 @@ ; ; HASWELL-LABEL: test_pcmpistrm: ; HASWELL: # BB#0: -; HASWELL-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [10:3.00] -; HASWELL-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [10:3.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00] +; HASWELL-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:3.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpistrm: ; BTVER2: # BB#0: @@ -534,7 +534,7 @@ ; HASWELL: # BB#0: ; HASWELL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] ; HASWELL-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] 
-; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pcmpgtq: ; BTVER2: # BB#0: Index: test/CodeGen/X86/ssse3-schedule.ll =================================================================== --- test/CodeGen/X86/ssse3-schedule.ll +++ test/CodeGen/X86/ssse3-schedule.ll @@ -42,9 +42,9 @@ ; HASWELL-LABEL: test_pabsb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50] +; HASWELL-NEXT: vpabsb (%rdi), %xmm1 # sched: [1:0.50] ; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pabsb: ; BTVER2: # BB#0: @@ -100,9 +100,9 @@ ; HASWELL-LABEL: test_pabsd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50] +; HASWELL-NEXT: vpabsd (%rdi), %xmm1 # sched: [1:0.50] ; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pabsd: ; BTVER2: # BB#0: @@ -158,9 +158,9 @@ ; HASWELL-LABEL: test_pabsw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpabsw (%rdi), %xmm1 # sched: [5:0.50] +; HASWELL-NEXT: vpabsw (%rdi), %xmm1 # sched: [1:0.50] ; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pabsw: ; BTVER2: # BB#0: @@ -216,8 +216,8 @@ ; HASWELL-LABEL: test_palignr: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00] -; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 
sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_palignr: ; BTVER2: # BB#0: @@ -264,8 +264,8 @@ ; HASWELL-LABEL: test_phaddd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:2.00] -; HASWELL-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_phaddd: ; BTVER2: # BB#0: @@ -313,8 +313,8 @@ ; HASWELL-LABEL: test_phaddsw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00] -; HASWELL-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_phaddsw: ; BTVER2: # BB#0: @@ -362,8 +362,8 @@ ; HASWELL-LABEL: test_phaddw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:2.00] -; HASWELL-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_phaddw: ; BTVER2: # BB#0: @@ -411,8 +411,8 @@ ; HASWELL-LABEL: test_phsubd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:2.00] -; HASWELL-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_phsubd: ; BTVER2: # BB#0: @@ -460,8 +460,8 @@ ; HASWELL-LABEL: test_phsubsw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00] -; HASWELL-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: retq # sched: 
[2:1.00] ; ; BTVER2-LABEL: test_phsubsw: ; BTVER2: # BB#0: @@ -509,8 +509,8 @@ ; HASWELL-LABEL: test_phsubw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:2.00] -; HASWELL-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:2.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [3:2.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_phsubw: ; BTVER2: # BB#0: @@ -558,8 +558,8 @@ ; HASWELL-LABEL: test_pmaddubsw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmaddubsw: ; BTVER2: # BB#0: @@ -608,8 +608,8 @@ ; HASWELL-LABEL: test_pmulhrsw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pmulhrsw: ; BTVER2: # BB#0: @@ -657,8 +657,8 @@ ; HASWELL-LABEL: test_pshufb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; HASWELL-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:1.00] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_pshufb: ; BTVER2: # BB#0: @@ -710,8 +710,8 @@ ; HASWELL-LABEL: test_psignb: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: 
test_psignb: ; BTVER2: # BB#0: @@ -763,8 +763,8 @@ ; HASWELL-LABEL: test_psignd: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psignd: ; BTVER2: # BB#0: @@ -816,8 +816,8 @@ ; HASWELL-LABEL: test_psignw: ; HASWELL: # BB#0: ; HASWELL-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; HASWELL-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NEXT: retq # sched: [1:1.00] +; HASWELL-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [2:1.00] ; ; BTVER2-LABEL: test_psignw: ; BTVER2: # BB#0: Index: test/CodeGen/X86/vector-shift-ashr-512.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-512.ll +++ test/CodeGen/X86/vector-shift-ashr-512.ll @@ -201,14 +201,14 @@ ; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm5 ; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm8 ; AVX512DQ-NEXT: vpblendvb %ymm8, %ymm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-NEXT: vpsraw $4, %ymm5, %ymm9 +; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm9, %ymm5, %ymm3 ; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm5 ; AVX512DQ-NEXT: vpaddw %ymm8, %ymm8, %ymm9 ; AVX512DQ-NEXT: vpblendvb %ymm9, %ymm5, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-NEXT: vpsraw $4, %ymm4, %ymm5 -; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 ; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4 ; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4 @@ -328,14 +328,14 @@ ; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm5 ; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm8 ; AVX512DQ-NEXT: vpblendvb %ymm8, %ymm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-NEXT: vpsraw $4, %ymm5, %ymm9 +; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm9, %ymm5, %ymm3 ; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm5 ; AVX512DQ-NEXT: vpaddw %ymm8, %ymm8, %ymm9 ; AVX512DQ-NEXT: vpblendvb %ymm9, %ymm5, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-NEXT: vpsraw $4, %ymm4, %ymm5 -; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 ; AVX512DQ-NEXT: vpsraw $2, %ymm3, %ymm4 ; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpsraw $1, %ymm3, %ymm4 Index: test/CodeGen/X86/vector-shift-lshr-256.ll 
=================================================================== --- test/CodeGen/X86/vector-shift-lshr-256.ll +++ test/CodeGen/X86/vector-shift-lshr-256.ll @@ -777,9 +777,9 @@ ; ; AVX512DQ-LABEL: splatvar_shift_v32i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm2 Index: test/CodeGen/X86/vector-shift-shl-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-shl-256.ll +++ test/CodeGen/X86/vector-shift-shl-256.ll @@ -713,9 +713,9 @@ ; ; AVX512DQ-LABEL: splatvar_shift_v32i8: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1 ; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2 Index: test/CodeGen/X86/vector-shuffle-512-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v32.ll +++ test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -68,13 +68,13 @@ ; KNL-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,12,13,12,13,10,11,0,1,4,5,4,5,0,1] ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] ; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,3,2,2,4,5,6,7] -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm1 -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm5 -; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[12,13,10,11,8,9,14,15,4,5,2,3,2,3,6,7] -; KNL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15] -; KNL-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; KNL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15] +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm5 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7],ymm0[8],ymm5[9],ymm0[10],ymm5[11],ymm0[12],ymm5[13],ymm0[14],ymm5[15] ; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3 ; KNL-NEXT: vpbroadcastw %xmm3, %ymm3 ; KNL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]