Index: llvm/trunk/lib/Target/X86/X86FixupBWInsts.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86FixupBWInsts.cpp
+++ llvm/trunk/lib/Target/X86/X86FixupBWInsts.cpp
@@ -166,15 +166,85 @@
   return true;
 }
 
-// TODO: This method of analysis can miss some legal cases, because the
-// super-register could be live into the address expression for a memory
-// reference for the instruction, and still be killed/last used by the
-// instruction. However, the existing query interfaces don't seem to
-// easily allow that to be checked.
-//
-// What we'd really like to know is whether after OrigMI, the
-// only portion of SuperDestReg that is alive is the portion that
-// was the destination register of OrigMI.
+/// Check if register \p Reg is live after the instruction \p MI.
+///
+/// \p LiveRegs should be in a state describing the liveness information at
+/// exactly that point, as this function tries to refine the analysis made by
+/// \p LiveRegs by exploiting information about the particular instruction
+/// \p MI. \p MI is expected to be one of the MOVs handled by the
+/// x86FixupBWInsts pass.
+/// Note: similar to LivePhysRegs::contains, this reports a super-register
+/// as not used if only some part of it is used.
+///
+/// The X86 backend does not have subregister liveness tracking enabled, so
+/// the liveness information might be overly conservative. However, for some
+/// specific instructions (this pass only cares about MOVs) we can produce
+/// more precise results by analysing that MOV's operands.
+///
+/// Indeed, if the super-register is not live before the MOV, it means that
+/// its upper bits were originally undef, and so we are free to modify these
+/// undef upper bits. That may happen when the use is in another MBB and the
+/// vreg/physreg corresponding to the MOV is wider than necessary (e.g. due
+/// to register coalescing with a "truncate" copy). So, it handles patterns
+/// like this:
+///
+///   BB#2: derived from LLVM BB %if.then
+///   Live Ins: %RDI
+///   Predecessors according to CFG: BB#0
+///   %AX = MOV16rm %RDI, 1, %noreg, 0, %noreg, %EAX; mem:LD2[%p]
+///        No implicit %EAX use -> %EAX is not live before the MOV
+///   Successors according to CFG: BB#3(?%)
+///
+///   BB#3: derived from LLVM BB %if.end
+///   Live Ins: %EAX   (only %AX is actually live)
+///   Predecessors according to CFG: BB#2 BB#1
+///   %AX = KILL %AX, %EAX
+///   RET 0, %AX
+static bool isLive(const MachineInstr &MI,
+                   const LivePhysRegs &LiveRegs,
+                   const TargetRegisterInfo *TRI,
+                   unsigned Reg) {
+  if (!LiveRegs.contains(Reg))
+    return false;
+
+  unsigned Opc = MI.getOpcode(); (void)Opc;
+  // These are the opcodes currently handled by the pass; if anything else is
+  // added, we need to make sure the new opcode has the same properties.
+  assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr ||
+          Opc == X86::MOV16rr) &&
+         "Unexpected opcode.");
+
+  bool IsDefined = false;
+  for (auto &MO : MI.implicit_operands()) {
+    if (!MO.isReg())
+      continue;
+
+    assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!");
+
+    for (MCSuperRegIterator Supers(Reg, TRI, true); Supers.isValid(); ++Supers) {
+      if (*Supers == MO.getReg()) {
+        if (MO.isDef())
+          IsDefined = true;
+        else
+          return true; // SuperReg is imp-used -> live before the MI.
+      }
+    }
+  }
+  // Reg is not imp-def'ed -> it's live both before and after the instruction.
+  if (!IsDefined)
+    return true;
+
+  // Otherwise, the Reg is not live before the MI and the MOV can't
+  // make it really live, so it's in fact dead even after the MI.
+ return false; +} + +/// \brief Check if after \p OrigMI the only portion of super register +/// of the destination register of \p OrigMI that is alive is that +/// destination register. +/// +/// If so, return that super register in \p SuperDestReg. bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, unsigned &SuperDestReg) const { auto *TRI = &TII->getRegisterInfo(); @@ -191,7 +261,7 @@ if (SubRegIdx == X86::sub_8bit_hi) return false; - if (LiveRegs.contains(SuperDestReg)) + if (isLive(*OrigMI, LiveRegs, TRI, SuperDestReg)) return false; if (SubRegIdx == X86::sub_8bit) { @@ -201,7 +271,7 @@ unsigned UpperByteReg = getX86SubSuperRegister(SuperDestReg, 8, /*High=*/true); - if (LiveRegs.contains(UpperByteReg)) + if (isLive(*OrigMI, LiveRegs, TRI, UpperByteReg)) return false; } @@ -328,7 +398,7 @@ for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) { MachineInstr *MI = &*I; - + if (MachineInstr *NewMI = tryReplaceInstr(MI, MBB)) MIReplacements.push_back(std::make_pair(MI, NewMI)); Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -985,7 +985,7 @@ define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind { ; X32-LABEL: test_mm256_insert_epi16: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2031,46 +2031,46 @@ define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind { ; X32-LABEL: test_mm256_set_epi16: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovd %eax, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovd %eax, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; 
X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_epi16: ; X64: # BB#0: -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vmovd %eax, %xmm0 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; X64-NEXT: vpinsrw $2, %r9d, %xmm0, %xmm0 ; X64-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 @@ -2078,21 +2078,21 @@ ; X64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 ; X64-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0 ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vmovd %eax, %xmm1 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq @@ -2362,7 +2362,7 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind { ; X32-LABEL: test_mm256_set1_epi16: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovd %eax, %xmm0 ; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] @@ -2668,58 +2668,58 @@ define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind { ; X32-LABEL: test_mm256_setr_epi16: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovd %eax, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl 
{{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vmovd %eax, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_setr_epi16: ; X64: # BB#0: -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vmovd %eax, %xmm0 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ; X64-NEXT: vmovd %edi, %xmm1 ; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1 @@ -2727,9 +2727,9 @@ ; X64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 ; X64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1 ; X64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax +; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq Index: llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll +++ llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll @@ -1215,7 +1215,7 @@ ; X64: ## BB#0: ## %entry ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: vmovd %eax, %xmm1 ; X64-NEXT: vpbroadcastw %xmm1, %xmm1 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) @@ -1274,7 +1274,7 @@ ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: movw (%rdi), %ax +; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: vmovd %eax, %xmm1 ; X64-NEXT: vpbroadcastw %xmm1, %ymm1 ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) Index: 
llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -23,7 +23,7 @@ define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm512_mask_broadcastd_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1} ; X32-NEXT: retl @@ -45,7 +45,7 @@ define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm512_maskz_broadcastd_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; X32-NEXT: retl @@ -184,7 +184,7 @@ define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) { ; X32-LABEL: test_mm512_mask_broadcastss_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vbroadcastss %xmm1, %zmm0 {%k1} ; X32-NEXT: retl @@ -203,7 +203,7 @@ define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) { ; X32-LABEL: test_mm512_maskz_broadcastss_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; X32-NEXT: retl @@ -288,7 +288,7 @@ define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_mask_movehdup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; X32-NEXT: retl @@ -307,7 +307,7 @@ define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) { ; X32-LABEL: test_mm512_maskz_movehdup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; X32-NEXT: retl @@ -340,7 +340,7 @@ define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_mask_moveldup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; X32-NEXT: retl @@ -359,7 +359,7 @@ define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) { ; X32-LABEL: test_mm512_maskz_moveldup_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; X32-NEXT: retl @@ -444,7 +444,7 @@ define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_mask_permute_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] ; X32-NEXT: retl @@ -463,7 +463,7 
@@ define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) { ; X32-LABEL: test_mm512_maskz_permute_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] ; X32-NEXT: retl @@ -602,7 +602,7 @@ define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_mask_shuffle_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; X32-NEXT: retl @@ -624,7 +624,7 @@ define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) { ; X32-LABEL: test_mm512_maskz_shuffle_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; X32-NEXT: retl @@ -714,7 +714,7 @@ define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) { ; X32-LABEL: test_mm512_mask_unpackhi_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15] ; X32-NEXT: retl @@ -737,7 +737,7 @@ define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_maskz_unpackhi_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X32-NEXT: retl @@ -877,7 +877,7 @@ define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) { ; X32-LABEL: test_mm512_mask_unpackhi_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15] ; X32-NEXT: retl @@ -896,7 +896,7 @@ define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_maskz_unpackhi_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X32-NEXT: retl @@ -932,7 +932,7 @@ define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) { ; X32-LABEL: test_mm512_mask_unpacklo_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] ; 
X32-NEXT: retl @@ -955,7 +955,7 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) { ; X32-LABEL: test_mm512_maskz_unpacklo_epi32: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X32-NEXT: retl @@ -1095,7 +1095,7 @@ define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) { ; X32-LABEL: test_mm512_mask_unpacklo_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] ; X32-NEXT: retl @@ -1114,7 +1114,7 @@ define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) { ; X32-LABEL: test_mm512_maskz_unpacklo_ps: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovw %eax, %k1 ; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X32-NEXT: retl Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1686,19 +1686,19 @@ ; AVX512F-32-NEXT: .Lcfi9: ; AVX512F-32-NEXT: .cfi_offset %ebx, -8 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al -; AVX512F-32-NEXT: movb %cl, %bl +; AVX512F-32-NEXT: movl %ecx, %ebx ; AVX512F-32-NEXT: andb $15, %bl -; AVX512F-32-NEXT: movb %cl, %dl +; AVX512F-32-NEXT: movl %ecx, %edx ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k0 -; AVX512F-32-NEXT: movb %bl, %dl +; AVX512F-32-NEXT: movl %ebx, %edx ; AVX512F-32-NEXT: shrb $2, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: movb %cl, %bl +; AVX512F-32-NEXT: movl %ecx, %ebx ; AVX512F-32-NEXT: shrb $4, %bl ; AVX512F-32-NEXT: shrb $3, %dl ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 @@ -1739,7 +1739,7 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -1750,7 +1750,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -1781,7 +1781,7 @@ ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl ; 
AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: movl %edx, %eax ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -1854,7 +1854,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 @@ -1866,9 +1866,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %bl +; AVX512F-32-NEXT: movl %eax, %ebx ; AVX512F-32-NEXT: andb $15, %bl -; AVX512F-32-NEXT: movb %bl, %dl +; AVX512F-32-NEXT: movl %ebx, %edx ; AVX512F-32-NEXT: shrb $2, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -1889,7 +1889,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: shrb $4, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -1900,7 +1900,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: shrb $5, %dl ; AVX512F-32-NEXT: andb $1, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 @@ -1912,7 +1912,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: shrb $6, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -1944,7 +1944,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 @@ -1956,9 +1956,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: movl %edx, %eax ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 @@ -2034,7 +2034,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2046,9 +2046,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %dl +; AVX512F-32-NEXT: movl %ecx, %edx ; AVX512F-32-NEXT: 
andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: movl %edx, %eax ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2069,7 +2069,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2080,7 +2080,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2092,7 +2092,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2103,7 +2103,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2137,7 +2137,7 @@ ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl ; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: movl %edx, %eax ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2217,7 +2217,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bl, %dl +; AVX512F-32-NEXT: movl %ebx, %edx ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 @@ -2229,9 +2229,9 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movb %bl, %al +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: andb $15, %al -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: shrb $2, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 @@ -2255,7 +2255,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bl, %al +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2267,7 +2267,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bl, %al +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2280,7 +2280,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, 
%ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bl, %al +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2315,7 +2315,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 @@ -2327,9 +2327,9 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: movl %edx, %eax ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k0 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 @@ -2570,19 +2570,19 @@ ; AVX512F-32-NEXT: .Lcfi15: ; AVX512F-32-NEXT: .cfi_offset %ebx, -8 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al -; AVX512F-32-NEXT: movb %cl, %bl +; AVX512F-32-NEXT: movl %ecx, %ebx ; AVX512F-32-NEXT: andb $15, %bl -; AVX512F-32-NEXT: movb %cl, %dl +; AVX512F-32-NEXT: movl %ecx, %edx ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k0 -; AVX512F-32-NEXT: movb %bl, %dl +; AVX512F-32-NEXT: movl %ebx, %edx ; AVX512F-32-NEXT: shrb $2, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 -; AVX512F-32-NEXT: movb %cl, %bl +; AVX512F-32-NEXT: movl %ecx, %ebx ; AVX512F-32-NEXT: shrb $4, %bl ; AVX512F-32-NEXT: shrb $3, %dl ; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2 @@ -2623,7 +2623,7 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2634,7 +2634,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2665,7 +2665,7 @@ ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl ; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: movl %edx, %eax ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2738,7 +2738,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl 
; AVX512F-32-NEXT: kmovd %edx, %k1 @@ -2750,9 +2750,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %bl +; AVX512F-32-NEXT: movl %eax, %ebx ; AVX512F-32-NEXT: andb $15, %bl -; AVX512F-32-NEXT: movb %bl, %dl +; AVX512F-32-NEXT: movl %ebx, %edx ; AVX512F-32-NEXT: shrb $2, %bl ; AVX512F-32-NEXT: kmovd %ebx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2773,7 +2773,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: shrb $4, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2784,7 +2784,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: shrb $5, %dl ; AVX512F-32-NEXT: andb $1, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 @@ -2796,7 +2796,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: shrb $6, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2828,7 +2828,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 @@ -2840,9 +2840,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: movl %edx, %eax ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4 @@ -2918,7 +2918,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: andb $2, %al ; AVX512F-32-NEXT: shrb %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2930,9 +2930,9 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %dl +; AVX512F-32-NEXT: movl %ecx, %edx ; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: movl %edx, %eax ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2953,7 +2953,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b 
%k1, %zmm2 @@ -2964,7 +2964,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -2976,7 +2976,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -2987,7 +2987,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %cl, %al +; AVX512F-32-NEXT: movl %ecx, %eax ; AVX512F-32-NEXT: shrb $7, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -3021,7 +3021,7 @@ ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 ; AVX512F-32-NEXT: movb %ch, %dl ; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: movl %edx, %eax ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -3101,7 +3101,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bl, %dl +; AVX512F-32-NEXT: movl %ebx, %edx ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 @@ -3113,9 +3113,9 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movb %bl, %al +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: andb $15, %al -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: shrb $2, %al ; AVX512F-32-NEXT: kmovd %eax, %k0 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 @@ -3139,7 +3139,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bl, %al +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $4, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -3151,7 +3151,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bl, %al +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $5, %al ; AVX512F-32-NEXT: andb $1, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 @@ -3164,7 +3164,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %bl, %al +; AVX512F-32-NEXT: movl %ebx, %eax ; AVX512F-32-NEXT: shrb $6, %al ; AVX512F-32-NEXT: kmovd %eax, %k1 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2 @@ -3199,7 +3199,7 @@ ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, 
%edx ; AVX512F-32-NEXT: andb $2, %dl ; AVX512F-32-NEXT: shrb %dl ; AVX512F-32-NEXT: kmovd %edx, %k1 @@ -3211,9 +3211,9 @@ ; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] ; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 ; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-32-NEXT: movb %al, %dl +; AVX512F-32-NEXT: movl %eax, %edx ; AVX512F-32-NEXT: andb $15, %dl -; AVX512F-32-NEXT: movb %dl, %al +; AVX512F-32-NEXT: movl %edx, %eax ; AVX512F-32-NEXT: shrb $2, %dl ; AVX512F-32-NEXT: kmovd %edx, %k0 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1 Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1864,7 +1864,7 @@ ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: movw {{[0-9]+}}(%esp), %ax +; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm1 {%k1} {z} ; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm0 {%k1} ; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm2 Index: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll @@ -23,7 +23,7 @@ define <2 x i64> @test_mm_mask_broadcastb_epi8(<2 x i64> %a0, i16 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm_mask_broadcastb_epi8: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovd %eax, %k1 ; X32-NEXT: vpbroadcastb %xmm1, %xmm0 {%k1} ; X32-NEXT: retl @@ -45,7 +45,7 @@ define <2 x i64> @test_mm_maskz_broadcastb_epi8(i16 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm_maskz_broadcastb_epi8: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovd %eax, %k1 ; X32-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ; X32-NEXT: retl @@ -200,7 +200,7 @@ define <4 x i64> @test_mm256_mask_broadcastw_epi16(<4 x i64> %a0, i16 %a1, <2 x i64> %a2) { ; X32-LABEL: test_mm256_mask_broadcastw_epi16: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovd %eax, %k1 ; X32-NEXT: vpbroadcastw %xmm1, %ymm0 {%k1} ; X32-NEXT: retl @@ -222,7 +222,7 @@ define <4 x i64> @test_mm256_maskz_broadcastw_epi16(i16 %a0, <2 x i64> %a1) { ; X32-LABEL: test_mm256_maskz_broadcastw_epi16: ; X32: # BB#0: -; X32-NEXT: movw {{[0-9]+}}(%esp), %ax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovd %eax, %k1 ; X32-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ; X32-NEXT: retl Index: llvm/trunk/test/CodeGen/X86/bitreverse.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/bitreverse.ll +++ llvm/trunk/test/CodeGen/X86/bitreverse.ll @@ -10,8 +10,8 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind { ; X86-LABEL: test_bitreverse_v2i16: ; X86: # BB#0: -; X86-NEXT: movw {{[0-9]+}}(%esp), %cx -; X86-NEXT: movw {{[0-9]+}}(%esp), %ax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rolw $8, %ax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: andl $3855, %edx # imm = 0xF0F @@ 
-271,7 +271,7 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; X86-LABEL: test_bitreverse_i16: ; X86: # BB#0: -; X86-NEXT: movw {{[0-9]+}}(%esp), %ax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rolw $8, %ax ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $3855, %ecx # imm = 0xF0F Index: llvm/trunk/test/CodeGen/X86/cmov.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/cmov.ll +++ llvm/trunk/test/CodeGen/X86/cmov.ll @@ -94,7 +94,7 @@ ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: testb %dl, %dl ; CHECK-NEXT: setne %bl -; CHECK-NEXT: movb %al, %cl +; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: je .LBB3_4 ; CHECK-NEXT: # BB#3: # %func_4.exit.i ; CHECK-NEXT: xorl %ecx, %ecx @@ -107,7 +107,7 @@ ; CHECK-NEXT: # BB#6: # %bb.i.i ; CHECK-NEXT: movb {{.*}}(%rip), %cl ; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: movb %al, %cl +; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: .LBB3_7: # %func_1.exit ; CHECK-NEXT: movb %cl, {{.*}}(%rip) ; CHECK-NEXT: movzbl %cl, %esi @@ -197,7 +197,7 @@ ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: jne .LBB6_2 ; CHECK-NEXT: # BB#1: -; CHECK-NEXT: movb %dl, %sil +; CHECK-NEXT: movl %edx, %esi ; CHECK-NEXT: .LBB6_2: ; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: retq Index: llvm/trunk/test/CodeGen/X86/fixup-bw-inst.mir =================================================================== --- llvm/trunk/test/CodeGen/X86/fixup-bw-inst.mir +++ llvm/trunk/test/CodeGen/X86/fixup-bw-inst.mir @@ -0,0 +1,152 @@ +# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass x86-fixup-bw-insts %s -o - | FileCheck %s + +--- | + define void @test1() { ret void } + + define void @test2() { ret void } + + define i16 @test3(i16* readonly %p) { + ; Keep original IR to show how the situation like this might happen + ; due to preceding CG passes. + ; + ; %0 is used in %if.end BB (before tail-duplication), so its + ; corresponding super-register (EAX) is live-in into that BB (%if.end) + ; and also has an EAX flag. Make sure that we still change + ; the movw into movzwl because EAX is not live before the load (which + ; can be seen by the fact that EAX flag is missing). + entry: + %tobool = icmp eq i16* %p, null + br i1 %tobool, label %if.end, label %if.then + + if.then: ; preds = %entry + %0 = load i16, i16* %p, align 2 + br label %if.end + + if.end: ; preds = %if.then, %entry + %i.0 = phi i16 [ %0, %if.then ], [ 0, %entry ] + ret i16 %i.0 + } + +... +--- +# CHECK-LABEL: name: test1 +name: test1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: +liveins: + - { reg: '%rax' } +frameInfo: + stackSize: 0 +fixedStack: +stack: +constants: +# Verify that "movw (%rax), %ax" is changed to "movzwl (%rax), %rax". +# +# For that to happen, the liveness information after the MOV16rm +# instruction should be used, not before it because %rax is live +# before the MOV and is killed by it. +body: | + bb.0: + liveins: %rax + + %ax = MOV16rm killed %rax, 1, _, 0, _ + ; CHECK: %eax = MOVZX32rm16 killed %rax + + RETQ %ax + +... 
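What test1 above verifies, expressed as plain assembly, is roughly the following (an illustrative sketch, not output captured from the compiler): the 16-bit load writes only the low half of the register, whereas the zero-extending form rewrites all of %eax, and that is legal here only because no other part of %rax is live once the MOV has consumed it as the address.

        # before: partial write -- the upper bits of %rax keep whatever
        # value they had, so the old contents of %rax still matter
        movw    (%rax), %ax
        # after: full 32-bit write with zeroed upper bits; safe because the
        # rest of %rax is dead after the load (it was killed as the address)
        movzwl  (%rax), %eax
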
+--- +# CHECK-LABEL: name: test2 +name: test2 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: +liveins: + - { reg: '%rax' } +frameInfo: + stackSize: 0 +fixedStack: +stack: +constants: +# Imp-use of any super-register means the register is live before the MOV +body: | + bb.0: + liveins: %dl, %rbx, %rcx, %r14 + + %cl = MOV8rr killed %dl, implicit killed %rcx, implicit-def %rcx + ; CHECK: %cl = MOV8rr killed %dl, implicit killed %rcx, implicit-def %rcx + JMP_1 %bb.1 + bb.1: + liveins: %rcx + + RETQ %cl + +... +--- +# CHECK-LABEL: name: test3 +name: test3 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: +liveins: + - { reg: '%rdi', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +# After MOV16rm the whole %eax is not *really* live, as can be seen by +# missing implicit-uses of it in that MOV. Make sure that MOV is +# transformed into MOVZX. +# See the comment near the original IR on what preceding decisions can +# lead to that. +body: | + bb.0.entry: + successors: %bb.1(0x30000000), %bb.2.if.then(0x50000000) + liveins: %rdi + + TEST64rr %rdi, %rdi, implicit-def %eflags + JE_1 %bb.1, implicit %eflags + + bb.2.if.then: + liveins: %rdi + + %ax = MOV16rm killed %rdi, 1, _, 0, _, implicit-def %eax :: (load 2 from %ir.p) + ; CHECK: %eax = MOVZX32rm16 killed %rdi, 1, _, 0, _, implicit-def %eax :: (load 2 from %ir.p) + %ax = KILL %ax, implicit killed %eax + RETQ %ax + + bb.1: + %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags + %ax = KILL %ax, implicit killed %eax + RETQ %ax + +... 
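To make the cross-block case of test3 concrete, the compiled function might end up looking roughly like the hand-written assembly below (labels and exact layout are invented for illustration, not taken from actual llc output). %eax is live into the return block only because the i16 value was widened, and its upper 16 bits are never read, so the zero-extending load is still performed:

test3:
        testq   %rdi, %rdi              # null check on %p
        je      .LBB_null               # hypothetical label
        movzwl  (%rdi), %eax            # was: movw (%rdi), %ax
        retq
.LBB_null:
        xorl    %eax, %eax              # i.0 = 0 on the null path
        retq

test2, by contrast, must keep the plain byte copy: the MOV8rr carries an implicit use of %rcx, so the rest of %rcx really is live across the copy, and zero-extending it would clobber bits that bb.1 still reads.
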
Index: llvm/trunk/test/CodeGen/X86/pr32345.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pr32345.ll
+++ llvm/trunk/test/CodeGen/X86/pr32345.ll
@@ -95,7 +95,7 @@
 ; X64-LABEL: foo:
 ; X64: # BB#0: # %bb
 ; X64-NEXT: movzwl {{.*}}(%rip), %ecx
-; X64-NEXT: movw {{.*}}(%rip), %ax
+; X64-NEXT: movzwl {{.*}}(%rip), %eax
 ; X64-NEXT: xorw %cx, %ax
 ; X64-NEXT: xorl %ecx, %eax
 ; X64-NEXT: movzwl %ax, %eax
@@ -119,7 +119,7 @@
 ; 686-NEXT: andl $-8, %esp
 ; 686-NEXT: subl $8, %esp
 ; 686-NEXT: movzwl var_27, %ecx
-; 686-NEXT: movw var_22, %ax
+; 686-NEXT: movzwl var_22, %eax
 ; 686-NEXT: xorw %cx, %ax
 ; 686-NEXT: xorl %ecx, %eax
 ; 686-NEXT: movzwl %ax, %eax
Index: llvm/trunk/test/CodeGen/X86/pr32420.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pr32420.ll
+++ llvm/trunk/test/CodeGen/X86/pr32420.ll
@@ -16,7 +16,7 @@
 ; CHECK-NEXT: shll $12, %ecx
 ; CHECK-NEXT: sarw $12, %cx
 ; CHECK-NEXT: movq _b@{{.*}}(%rip), %rdx
-; CHECK-NEXT: movw %cx, %si
+; CHECK-NEXT: movl %ecx, %esi
 ; CHECK-NEXT: orw (%rdx), %si
 ; CHECK-NEXT: andl %ecx, %esi
 ; CHECK-NEXT: movw %si, (%rdx)
Index: llvm/trunk/test/CodeGen/X86/pr34137.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pr34137.ll
+++ llvm/trunk/test/CodeGen/X86/pr34137.ll
@@ -9,7 +9,7 @@
 ; CHECK-LABEL: pr34127:
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: movzwl {{.*}}(%rip), %eax
-; CHECK-NEXT: movw {{.*}}(%rip), %cx
+; CHECK-NEXT: movzwl {{.*}}(%rip), %ecx
 ; CHECK-NEXT: andw %ax, %cx
 ; CHECK-NEXT: andl %eax, %ecx
 ; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
Index: llvm/trunk/test/CodeGen/X86/select.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/select.ll
+++ llvm/trunk/test/CodeGen/X86/select.ll
@@ -170,8 +170,8 @@
 ; MCU-NEXT: testb $1, %al
 ; MCU-NEXT: jne .LBB4_2
 ; MCU-NEXT: # BB#1:
-; MCU-NEXT: movw {{[0-9]+}}(%esp), %cx
-; MCU-NEXT: movw {{[0-9]+}}(%esp), %dx
+; MCU-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; MCU-NEXT: movzwl {{[0-9]+}}(%esp), %edx
 ; MCU-NEXT: .LBB4_2:
 ; MCU-NEXT: movw %cx, 2(%esi)
 ; MCU-NEXT: movw %dx, (%esi)
Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -1410,7 +1410,7 @@
 define <2 x i64> @test_mm_insert_epi16(<2 x i64> %a0, i16 %a1) nounwind {
 ; X32-LABEL: test_mm_insert_epi16:
 ; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: pinsrw $1, %eax, %xmm0
 ; X32-NEXT: retl
 ;
@@ -2196,21 +2196,21 @@
 define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
 ; X32-LABEL: test_mm_set_epi16:
 ; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm2
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm3
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm4
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm5
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm6
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm7
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm0
 ; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -2223,8 +2223,8 @@
 ;
 ; X64-LABEL: test_mm_set_epi16:
 ; X64: # BB#0:
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
 ; X64-NEXT: movd %edi, %xmm0
 ; X64-NEXT: movd %esi, %xmm1
 ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2399,7 +2399,7 @@
 define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
 ; X32-LABEL: test_mm_set1_epi16:
 ; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm0
 ; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -2605,21 +2605,21 @@
 define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
 ; X32-LABEL: test_mm_setr_epi16:
 ; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm2
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm3
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm4
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm5
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm6
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm7
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movd %eax, %xmm0
 ; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -2632,8 +2632,8 @@
 ;
 ; X64-LABEL: test_mm_setr_epi16:
 ; X64: # BB#0:
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
 ; X64-NEXT: movd %eax, %xmm0
 ; X64-NEXT: movd %r10d, %xmm1
 ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
Index: llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll
+++ llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll
@@ -4303,7 +4303,7 @@
 ; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, %bx
+; ALL-NEXT: movl %eax, %ebx
 ; ALL-NEXT: shll $16, %ebx
 ; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; ALL-NEXT: callq __truncdfhf2
@@ -4328,7 +4328,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
 ; AVX1-NEXT: shll $16, %ebx
 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4342,7 +4342,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
 ; AVX1-NEXT: shll $16, %ebx
 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX1-NEXT: callq __truncdfhf2
@@ -4365,7 +4365,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
 ; AVX2-NEXT: shll $16, %ebx
 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4379,7 +4379,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
 ; AVX2-NEXT: shll $16, %ebx
 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX2-NEXT: callq __truncdfhf2
@@ -4402,7 +4402,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
 ; AVX512-NEXT: shll $16, %ebx
 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4416,7 +4416,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
 ; AVX512-NEXT: shll $16, %ebx
 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512-NEXT: callq __truncdfhf2
@@ -4444,7 +4444,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
 ; AVX1-NEXT: shll $16, %ebx
 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4458,7 +4458,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
 ; AVX1-NEXT: shll $16, %ebx
 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX1-NEXT: callq __truncdfhf2
@@ -4482,7 +4482,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
 ; AVX2-NEXT: shll $16, %ebx
 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4496,7 +4496,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
 ; AVX2-NEXT: shll $16, %ebx
 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX2-NEXT: callq __truncdfhf2
@@ -4520,7 +4520,7 @@
 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
 ; AVX512F-NEXT: shll $16, %ebx
 ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4534,7 +4534,7 @@
 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
 ; AVX512F-NEXT: shll $16, %ebx
 ; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512F-NEXT: callq __truncdfhf2
@@ -4558,7 +4558,7 @@
 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
 ; AVX512VL-NEXT: shll $16, %ebx
 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4572,7 +4572,7 @@
 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
 ; AVX512VL-NEXT: shll $16, %ebx
 ; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT: callq __truncdfhf2
@@ -4603,7 +4603,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
 ; AVX1-NEXT: shll $16, %ebx
 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4617,7 +4617,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
 ; AVX1-NEXT: shll $16, %ebx
 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX1-NEXT: callq __truncdfhf2
@@ -4641,7 +4641,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
 ; AVX2-NEXT: shll $16, %ebx
 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4655,7 +4655,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
 ; AVX2-NEXT: shll $16, %ebx
 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX2-NEXT: callq __truncdfhf2
@@ -4679,7 +4679,7 @@
 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
 ; AVX512F-NEXT: shll $16, %ebx
 ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4693,7 +4693,7 @@
 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
 ; AVX512F-NEXT: shll $16, %ebx
 ; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512F-NEXT: callq __truncdfhf2
@@ -4717,7 +4717,7 @@
 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
 ; AVX512VL-NEXT: shll $16, %ebx
 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4731,7 +4731,7 @@
 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
 ; AVX512VL-NEXT: shll $16, %ebx
 ; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT: callq __truncdfhf2
@@ -4766,7 +4766,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
 ; AVX1-NEXT: shll $16, %ebx
 ; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4780,7 +4780,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
 ; AVX1-NEXT: shll $16, %ebx
 ; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX1-NEXT: callq __truncdfhf2
@@ -4791,7 +4791,7 @@
 ; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
 ; AVX1-NEXT: # xmm0 = mem[1,0]
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
 ; AVX1-NEXT: shll $16, %ebx
 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4805,7 +4805,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
 ; AVX1-NEXT: shll $16, %ebx
 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX1-NEXT: callq __truncdfhf2
@@ -4833,7 +4833,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
 ; AVX2-NEXT: shll $16, %ebx
 ; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4847,7 +4847,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
 ; AVX2-NEXT: shll $16, %ebx
 ; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX2-NEXT: callq __truncdfhf2
@@ -4858,7 +4858,7 @@
 ; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
 ; AVX2-NEXT: # xmm0 = mem[1,0]
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
 ; AVX2-NEXT: shll $16, %ebx
 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4872,7 +4872,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
 ; AVX2-NEXT: shll $16, %ebx
 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX2-NEXT: callq __truncdfhf2
@@ -4899,7 +4899,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
 ; AVX512-NEXT: shll $16, %ebx
 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
@@ -4913,7 +4913,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
 ; AVX512-NEXT: shll $16, %ebx
 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512-NEXT: callq __truncdfhf2
@@ -4927,7 +4927,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
 ; AVX512-NEXT: shll $16, %ebx
 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -4941,7 +4941,7 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: movl %eax, %ebx
 ; AVX512-NEXT: shll $16, %ebx
 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512-NEXT: callq __truncdfhf2
@@ -5138,7 +5138,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
 ; AVX1-NEXT: shll $16, %ebp
 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -5152,7 +5152,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
 ; AVX1-NEXT: shll $16, %ebp
 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX1-NEXT: callq __truncdfhf2
@@ -5180,7 +5180,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
 ; AVX2-NEXT: shll $16, %ebp
 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -5194,7 +5194,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
 ; AVX2-NEXT: shll $16, %ebp
 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX2-NEXT: callq __truncdfhf2
@@ -5222,7 +5222,7 @@
 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
 ; AVX512F-NEXT: shll $16, %ebp
 ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -5236,7 +5236,7 @@
 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
 ; AVX512F-NEXT: shll $16, %ebp
 ; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512F-NEXT: callq __truncdfhf2
@@ -5264,7 +5264,7 @@
 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
 ; AVX512VL-NEXT: shll $16, %ebp
 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -5278,7 +5278,7 @@
 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
 ; AVX512VL-NEXT: shll $16, %ebp
 ; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT: callq __truncdfhf2
@@ -5314,7 +5314,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
 ; AVX1-NEXT: shll $16, %ebp
 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -5328,7 +5328,7 @@
 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
 ; AVX1-NEXT: shll $16, %ebp
 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX1-NEXT: callq __truncdfhf2
@@ -5356,7 +5356,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
 ; AVX2-NEXT: shll $16, %ebp
 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -5370,7 +5370,7 @@
 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
 ; AVX2-NEXT: shll $16, %ebp
 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX2-NEXT: callq __truncdfhf2
@@ -5398,7 +5398,7 @@
 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
 ; AVX512F-NEXT: shll $16, %ebp
 ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -5412,7 +5412,7 @@
 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
 ; AVX512F-NEXT: shll $16, %ebp
 ; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512F-NEXT: callq __truncdfhf2
@@ -5440,7 +5440,7 @@
 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
 ; AVX512VL-NEXT: shll $16, %ebp
 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT: # kill: %XMM0 %XMM0 %YMM0
@@ -5454,7 +5454,7 @@
 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
 ; AVX512VL-NEXT: shll $16, %ebp
 ; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; AVX512VL-NEXT: callq __truncdfhf2