Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -460,10 +460,13 @@
   std::pair<uint16_t, uint16_t>
   getExecutionDomain(const MachineInstr &MI) const override;
+  std::pair<uint16_t, uint16_t>
+  getExecutionDomainImpl(const MachineInstr &MI) const;
 
   uint16_t getExecutionDomainCustom(const MachineInstr &MI) const;
 
   void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
+  void setExecutionDomainImpl(MachineInstr &MI, unsigned Domain) const;
 
   bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const;
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7577,7 +7577,7 @@
 }
 
 std::pair<uint16_t, uint16_t>
-X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
+X86InstrInfo::getExecutionDomainImpl(const MachineInstr &MI) const {
   uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   unsigned opcode = MI.getOpcode();
   uint16_t validDomains = 0;
@@ -7617,7 +7617,23 @@
   return std::make_pair(domain, validDomains);
 }
 
-void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
+std::pair<uint16_t, uint16_t>
+X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
+  std::pair<uint16_t, uint16_t> Res = getExecutionDomainImpl(MI);
+
+  if (Res.first && Subtarget.hasAVX()) {
+    // Rotate the domain encodings 1-3 so that 3 becomes 1, making the integer
+    // domain preferred.
+    Res.first = (((Res.first - 1) + 1) % 3) + 1;
+    Res.second = ((Res.second & 0x6) << 1) | ((Res.second & 0x8) >> 2) |
+                 (Res.second & 0x1);
+  }
+
+  return Res;
+}
+
+void X86InstrInfo::setExecutionDomainImpl(MachineInstr &MI,
+                                          unsigned Domain) const {
   assert(Domain>0 && Domain<4 && "Invalid execution domain");
   uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   assert(dom && "Not an SSE instruction");
@@ -7667,6 +7683,17 @@
   MI.setDesc(get(table[Domain - 1]));
 }
 
+void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
+  assert(Domain>0 && Domain<4 && "Invalid execution domain");
+  if (Subtarget.hasAVX()) {
+    // Rotate the domain encoding so that domain 1 becomes 3, undoing the
+    // rotation in getExecutionDomain; integer instructions stay preferred.
+    Domain = ((Domain - 1) + 2) % 3 + 1;
+  }
+
+  setExecutionDomainImpl(MI, Domain);
+}
+
 /// Return the noop instruction to use for a noop.
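For reference, below is a minimal standalone sketch of the rotation used above; it is not part of the patch, and the helper names are invented for illustration. It assumes the usual X86 SSE domain numbering (1 = PackedSingle, 2 = PackedDouble, 3 = PackedInt), with bit d of the valid-domains mask corresponding to domain d. The wrapped getExecutionDomain reports the integer domain as 1 so the target-independent execution-domain-fix pass prefers it on AVX targets, setExecutionDomain undoes that rotation before the real opcode table is consulted, and the mask rotation keeps Res.second consistent with the rotated Res.first.

// Standalone sketch (not part of the patch); helper names are hypothetical.
#include <cassert>
#include <cstdint>

// getExecutionDomain direction: 1->2, 2->3, 3->1, so PackedInt is reported
// as the lowest-numbered (most preferred) domain. Domain 0 (generic) is kept.
static unsigned rotateDomainForAVX(unsigned Dom) {
  return Dom ? ((Dom - 1) + 1) % 3 + 1 : 0;
}

// setExecutionDomain direction: 1->3, 2->1, 3->2, the inverse mapping.
static unsigned unrotateDomainForAVX(unsigned Dom) {
  assert(Dom > 0 && Dom < 4 && "Invalid execution domain");
  return ((Dom - 1) + 2) % 3 + 1;
}

// Rotate the valid-domains bitmask the same way the domain number is rotated:
// bits 1-2 shift up, bit 3 wraps around to bit 1, bit 0 is preserved.
static uint16_t rotateValidDomains(uint16_t Mask) {
  return ((Mask & 0x6) << 1) | ((Mask & 0x8) >> 2) | (Mask & 0x1);
}

int main() {
  for (unsigned Dom = 1; Dom <= 3; ++Dom) {
    // The two rotations must round-trip.
    assert(unrotateDomainForAVX(rotateDomainForAVX(Dom)) == Dom);
    // The mask rotation must agree with the domain rotation.
    assert(rotateValidDomains(uint16_t(1) << Dom) ==
           (uint16_t(1) << rotateDomainForAVX(Dom)));
  }
  return 0;
}

Because the integer domain now wins ties on AVX targets, the test updates below flip floating-point spills, reloads, and zero idioms (vmovaps/vmovups/vxorps) to their integer forms (vmovdqa/vmovdqu/vpxor) and split several formerly shared ALL check prefixes into AVX1/AVX2/AVX512 variants.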
void X86InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); Index: llvm/test/CodeGen/X86/vector-half-conversions.ll =================================================================== --- llvm/test/CodeGen/X86/vector-half-conversions.ll +++ llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -655,14 +655,14 @@ ; ALL-LABEL: cvt_2f64_to_2i16: ; ALL: # %bb.0: ; ALL-NEXT: subq $40, %rsp -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 +; ALL-NEXT: vmovdqa (%rsp), %xmm0 ; ALL-NEXT: addq $40, %rsp ; ALL-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> @@ -671,62 +671,166 @@ } define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { -; ALL-LABEL: cvt_4f64_to_4i16: -; ALL: # %bb.0: -; ALL-NEXT: subq $88, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $88, %rsp -; ALL-NEXT: retq +; AVX1-LABEL: cvt_4f64_to_4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $88, %rsp +; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovdqa (%rsp), %xmm0 +; AVX1-NEXT: addq $88, %rsp +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_4f64_to_4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: subq $88, %rsp +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # 
kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 +; AVX2-NEXT: addq $88, %rsp +; AVX2-NEXT: retq +; +; AVX512-LABEL: cvt_4f64_to_4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 +; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> ret <4 x i16> %2 } define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { -; ALL-LABEL: cvt_4f64_to_8i16_undef: -; ALL: # %bb.0: -; ALL-NEXT: subq $88, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $88, %rsp -; ALL-NEXT: retq +; AVX1-LABEL: cvt_4f64_to_8i16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $88, %rsp +; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovdqa (%rsp), %xmm0 +; AVX1-NEXT: addq $88, %rsp +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_4f64_to_8i16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: subq $88, %rsp +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 +; AVX2-NEXT: addq $88, %rsp +; AVX2-NEXT: retq +; +; AVX512-LABEL: cvt_4f64_to_8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 +; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> @@ -734,31 +838,83 @@ } define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { -; ALL-LABEL: cvt_4f64_to_8i16_zero: -; ALL: # %bb.0: -; ALL-NEXT: subq $88, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; 
ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: addq $88, %rsp -; ALL-NEXT: retq +; AVX1-LABEL: cvt_4f64_to_8i16_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $88, %rsp +; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: addq $88, %rsp +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_4f64_to_8i16_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: subq $88, %rsp +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq $88, %rsp +; AVX2-NEXT: retq +; +; AVX512-LABEL: cvt_4f64_to_8i16_zero: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 
x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> @@ -772,20 +928,20 @@ ; AVX1-NEXT: pushq %r14 ; AVX1-NEXT: pushq %rbx ; AVX1-NEXT: subq $64, %rsp -; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %r15d ; AVX1-NEXT: orl %ebx, %r15d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] @@ -793,7 +949,7 @@ ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %r14d ; AVX1-NEXT: orl %ebx, %r14d @@ -804,13 +960,13 @@ ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %r15d ; AVX1-NEXT: orl %ebx, %r15d -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] @@ -818,7 +974,7 @@ ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebx ; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movzwl %ax, %eax ; AVX1-NEXT: orl %ebx, %eax @@ -839,14 +995,14 @@ ; AVX2-NEXT: pushq %r14 ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: subq $64, %rsp -; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 @@ -860,7 +1016,7 @@ ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movzwl %ax, %r14d ; AVX2-NEXT: orl %ebx, %r14d @@ -871,7 +1027,7 @@ ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: 
movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 @@ -885,7 +1041,7 @@ ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %ebx ; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movzwl %ax, %eax ; AVX2-NEXT: orl %ebx, %eax @@ -912,7 +1068,7 @@ ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2 @@ -926,7 +1082,7 @@ ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movzwl %ax, %r14d ; AVX512-NEXT: orl %ebx, %r14d @@ -940,7 +1096,7 @@ ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2 @@ -954,7 +1110,7 @@ ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebx ; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movzwl %ax, %eax ; AVX512-NEXT: orl %ebx, %eax @@ -1003,7 +1159,7 @@ ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movl %eax, %ebp -; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; ALL-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; ALL-NEXT: callq __truncdfhf2 ; ALL-NEXT: movw %ax, (%rbx) ; ALL-NEXT: movw %bp, 2(%rbx) @@ -1031,19 +1187,19 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r14d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movw %ax, 4(%rbx) ; AVX1-NEXT: movw %bp, (%rbx) @@ -1076,12 +1232,12 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # 
kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movw %ax, 4(%rbx) ; AVX2-NEXT: movw %bp, (%rbx) @@ -1114,12 +1270,12 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movw %ax, 4(%rbx) ; AVX512-NEXT: movw %bp, (%rbx) @@ -1138,35 +1294,95 @@ } define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind { -; ALL-LABEL: store_cvt_4f64_to_8i16_undef: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $80, %rsp -; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rbx) -; ALL-NEXT: addq $80, %rsp -; ALL-NEXT: popq %rbx -; ALL-NEXT: retq +; AVX1-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $80, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovdqa (%rsp), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rbx) +; AVX1-NEXT: addq $80, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: 
subq $80, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rbx) +; AVX2-NEXT: addq $80, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $80, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rbx) +; AVX512-NEXT: addq $80, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> @@ -1175,35 +1391,95 @@ } define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind { -; ALL-LABEL: store_cvt_4f64_to_8i16_zero: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $80, %rsp -; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2 -; ALL-NEXT: movw %ax, 
{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovaps %xmm0, (%rbx) -; ALL-NEXT: addq $80, %rsp -; ALL-NEXT: popq %rbx -; ALL-NEXT: retq +; AVX1-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $80, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2 +; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovdqa %xmm0, (%rbx) +; AVX1-NEXT: addq $80, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $80, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2 +; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovdqa %xmm0, (%rbx) +; AVX2-NEXT: addq $80, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $80, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: 
callq __truncdfhf2 +; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovdqa %xmm0, (%rbx) +; AVX512-NEXT: addq $80, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> @@ -1222,13 +1498,13 @@ ; AVX1-NEXT: pushq %rbx ; AVX1-NEXT: subq $136, %rsp ; AVX1-NEXT: movq %rdi, %rbx -; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] @@ -1239,27 +1515,27 @@ ; AVX1-NEXT: # xmm0 = mem[1,0] ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r12d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r13d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r14d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2 ; AVX1-NEXT: movw %ax, 12(%rbx) ; AVX1-NEXT: movw %r15w, 8(%rbx) @@ -1290,7 +1566,7 @@ ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: subq $136, %rsp ; AVX2-NEXT: movq %rdi, %rbx -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper @@ -1314,20 +1590,20 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %r13d -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %r14d -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2 ; AVX2-NEXT: movw %ax, 12(%rbx) ; AVX2-NEXT: movw %r15w, 8(%rbx) @@ -1384,20 +1660,20 @@ ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %r13d -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %r14d -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2 ; AVX512-NEXT: movw %ax, 12(%rbx) ; AVX512-NEXT: movw %r15w, 8(%rbx) Index: llvm/test/CodeGen/X86/vector-idiv.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv.ll +++ llvm/test/CodeGen/X86/vector-idiv.ll @@ -12,7 +12,7 @@ ; ; AVX-LABEL: test_urem_unary_v2i16: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0 %I9 = insertelement <2 x i16> %I8, i16 -1, i32 1 Index: llvm/test/CodeGen/X86/vector-lzcnt-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -1671,12 +1671,12 @@ ; ; NOBW-LABEL: foldv2i64: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; NOBW-NEXT: vmovdqa {{.*#+}} xmm0 = [55,0,0,0] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64: @@ -1695,12 +1695,12 @@ ; ; NOBW-LABEL: foldv2i64u: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; NOBW-NEXT: vmovdqa {{.*#+}} xmm0 = [55,0,0,0] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64u: @@ -1719,12 +1719,12 @@ ; ; NOBW-LABEL: foldv4i32: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: vmovdqa {{.*#+}} xmm0 = [23,0,32,24] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv4i32: ; 
AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [23,0,32,24] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32: @@ -1743,12 +1743,12 @@ ; ; NOBW-LABEL: foldv4i32u: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: vmovdqa {{.*#+}} xmm0 = [23,0,32,24] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv4i32u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [23,0,32,24] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32u: @@ -1767,12 +1767,12 @@ ; ; NOBW-LABEL: foldv8i16: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; NOBW-NEXT: vmovdqa {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv8i16: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16: @@ -1791,12 +1791,12 @@ ; ; NOBW-LABEL: foldv8i16u: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; NOBW-NEXT: vmovdqa {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv8i16u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16u: @@ -1815,12 +1815,12 @@ ; ; NOBW-LABEL: foldv16i8: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; NOBW-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv16i8: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8: @@ -1839,12 +1839,12 @@ ; ; NOBW-LABEL: foldv16i8u: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; NOBW-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv16i8u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8u: Index: llvm/test/CodeGen/X86/vector-lzcnt-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -1102,12 +1102,12 @@ define <4 x i64> @foldv4i64() nounwind { ; X64-LABEL: foldv4i64: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,64,56] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> , i1 0) ret <4 x i64> %out @@ -1116,12 +1116,12 @@ define <4 x i64> @foldv4i64u() nounwind { ; X64-LABEL: foldv4i64u: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,64,56] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: 
vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> , i1 -1) ret <4 x i64> %out @@ -1130,12 +1130,12 @@ define <8 x i32> @foldv8i32() nounwind { ; X64-LABEL: foldv8i32: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] ; X32-AVX-NEXT: retl %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> , i1 0) ret <8 x i32> %out @@ -1144,12 +1144,12 @@ define <8 x i32> @foldv8i32u() nounwind { ; X64-LABEL: foldv8i32u: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] ; X32-AVX-NEXT: retl %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> , i1 -1) ret <8 x i32> %out @@ -1158,12 +1158,12 @@ define <16 x i16> @foldv16i16() nounwind { ; X64-LABEL: foldv16i16: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] ; X32-AVX-NEXT: retl %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> , i1 0) ret <16 x i16> %out @@ -1172,12 +1172,12 @@ define <16 x i16> @foldv16i16u() nounwind { ; X64-LABEL: foldv16i16u: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] ; X32-AVX-NEXT: retl %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> , i1 -1) ret <16 x i16> %out @@ -1186,12 +1186,12 @@ define <32 x i8> @foldv32i8() nounwind { ; X64-LABEL: foldv32i8: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] ; X32-AVX-NEXT: retl %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> , i1 0) ret <32 x i8> %out @@ -1200,12 +1200,12 @@ define <32 x i8> @foldv32i8u() nounwind { ; X64-LABEL: foldv32i8u: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] ; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8u: ; X32-AVX: # %bb.0: -; 
X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] ; X32-AVX-NEXT: retl %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> , i1 -1) ret <32 x i8> %out Index: llvm/test/CodeGen/X86/vector-popcnt-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -593,17 +593,17 @@ ; ; AVX-LABEL: foldv2i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [1,64] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [1,64] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [1,64] ; BITALG-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> ) ret <2 x i64> %out @@ -617,17 +617,17 @@ ; ; AVX-LABEL: foldv4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [1,32,0,8] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [1,32,0,8] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [1,32,0,8] ; BITALG-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> ) ret <4 x i32> %out @@ -641,17 +641,17 @@ ; ; AVX-LABEL: foldv8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv8i16: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv8i16: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] ; BITALG-NEXT: retq %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> ) ret <8 x i16> %out @@ -665,17 +665,17 @@ ; ; AVX-LABEL: foldv16i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv16i8: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv16i8: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] ; BITALG-NEXT: retq %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> ) ret <16 x i8> %out Index: llvm/test/CodeGen/X86/vector-popcnt-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -308,7 +308,7 @@ define <4 x i64> @foldv4i64() nounwind { ; ALL-LABEL: foldv4i64: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] +; ALL-NEXT: vmovdqa {{.*#+}} ymm0 = 
[1,64,0,8] ; ALL-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> ) ret <4 x i64> %out @@ -317,7 +317,7 @@ define <8 x i32> @foldv8i32() nounwind { ; ALL-LABEL: foldv8i32: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; ALL-NEXT: vmovdqa {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] ; ALL-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> ) ret <8 x i32> %out @@ -326,7 +326,7 @@ define <16 x i16> @foldv16i16() nounwind { ; ALL-LABEL: foldv16i16: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] +; ALL-NEXT: vmovdqa {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] ; ALL-NEXT: retq %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> ) ret <16 x i16> %out @@ -335,7 +335,7 @@ define <32 x i8> @foldv32i8() nounwind { ; ALL-LABEL: foldv32i8: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] +; ALL-NEXT: vmovdqa {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] ; ALL-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> ) ret <32 x i8> %out Index: llvm/test/CodeGen/X86/vector-shuffle-avx512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -16,9 +16,9 @@ ; ; KNL-LABEL: expand: ; KNL: # %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7] +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7] ; KNL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> ret <8 x float> %res @@ -50,9 +50,9 @@ ; CHECK-LABEL: expand2: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] -; CHECK-NEXT: vmovaps %xmm0, %xmm0 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] +; CHECK-NEXT: vmovdqa %xmm0, %xmm0 +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> ret <4 x double> %res @@ -70,9 +70,9 @@ ; ; KNL-LABEL: expand3: ; KNL: # %bb.0: -; KNL-NEXT: vbroadcastsd %xmm0, %ymm0 -; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7] +; KNL-NEXT: vpbroadcastq %xmm0, %ymm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7] ; KNL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> ret <8 x i32> %res @@ -91,9 +91,9 @@ ; KNL-LABEL: expand4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; KNL-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] -; KNL-NEXT: vmovaps %xmm0, %xmm0 -; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; KNL-NEXT: vperm2i128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] +; KNL-NEXT: vmovdqa %xmm0, %xmm0 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; KNL-NEXT: ret{{[l|q]}} %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> ret <4 x i64> %res @@ -111,9 +111,9 @@ ; ; 
KNL-LABEL: expand5: ; KNL: # %bb.0: -; KNL-NEXT: vbroadcastss %xmm0, %ymm0 -; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; KNL-NEXT: vpbroadcastd %xmm0, %ymm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; KNL-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> ret <8 x float> %res @@ -123,8 +123,8 @@ define <8 x float> @expand6(<4 x float> %a ) { ; CHECK-LABEL: expand6: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> ret <8 x float> %res @@ -248,8 +248,8 @@ define <16 x float> @expand13(<8 x float> %a ) { ; CHECK-LABEL: expand13: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> ret <16 x float> %res @@ -284,16 +284,16 @@ ; SKX-LABEL: expand15: ; SKX: # %bb.0: ; SKX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; SKX-NEXT: vmovaps {{.*#+}} ymm1 = -; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; SKX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = +; SKX-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] ; SKX-NEXT: ret{{[l|q]}} ; ; KNL-LABEL: expand15: ; KNL: # %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] ; KNL-NEXT: ret{{[l|q]}} %addV = fadd <4 x float> , %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> @@ -617,30 +617,30 @@ define void @PR43170(<16 x float>* %a0) { ; SKX64-LABEL: PR43170: ; SKX64: # %bb.0: # %entry -; SKX64-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; SKX64-NEXT: vmovaps %zmm0, (%rdi) +; SKX64-NEXT: vmovdqa {{.*}}(%rip), %ymm0 +; SKX64-NEXT: vmovdqa64 %zmm0, (%rdi) ; SKX64-NEXT: vzeroupper ; SKX64-NEXT: retq ; ; KNL64-LABEL: PR43170: ; KNL64: # %bb.0: # %entry -; KNL64-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; KNL64-NEXT: vmovaps %zmm0, (%rdi) +; KNL64-NEXT: vmovdqa {{.*}}(%rip), %ymm0 +; KNL64-NEXT: vmovdqa64 %zmm0, (%rdi) ; KNL64-NEXT: retq ; ; SKX32-LABEL: PR43170: ; SKX32: # %bb.0: # %entry ; SKX32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX32-NEXT: vmovaps src1, %ymm0 -; SKX32-NEXT: vmovaps %zmm0, (%eax) +; SKX32-NEXT: vmovdqa src1, %ymm0 +; SKX32-NEXT: vmovdqa64 %zmm0, (%eax) ; SKX32-NEXT: vzeroupper ; SKX32-NEXT: retl ; ; KNL32-LABEL: PR43170: ; KNL32: # %bb.0: # %entry ; KNL32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL32-NEXT: vmovaps src1, %ymm0 -; KNL32-NEXT: vmovaps %zmm0, (%eax) +; KNL32-NEXT: vmovdqa src1, %ymm0 +; KNL32-NEXT: vmovdqa64 %zmm0, (%eax) ; KNL32-NEXT: retl entry: %0 = load <8 x float>, <8 x float>* bitcast (%union1* @src1 to <8 x float>*), align 64 Index: 
llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -32,24 +32,56 @@ } define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) { -; CHECK-LABEL: combine_vpermilvar_4f32_movddup: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vpermilvar_4f32_movddup: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vpermilvar_4f32_movddup: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vpermilvar_4f32_movddup: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) ret <4 x float> %1 } define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) { -; X86-LABEL: combine_vpermilvar_4f32_movddup_load: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: retl +; X86-AVX1-LABEL: combine_vpermilvar_4f32_movddup_load: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X86-AVX1-NEXT: retl ; -; X64-LABEL: combine_vpermilvar_4f32_movddup_load: -; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X64-NEXT: retq +; X86-AVX2-LABEL: combine_vpermilvar_4f32_movddup_load: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vpbroadcastq (%eax), %xmm0 +; X86-AVX2-NEXT: retl +; +; X86-AVX512-LABEL: combine_vpermilvar_4f32_movddup_load: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vpbroadcastq (%eax), %xmm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX1-LABEL: combine_vpermilvar_4f32_movddup_load: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: combine_vpermilvar_4f32_movddup_load: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq (%rdi), %xmm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: combine_vpermilvar_4f32_movddup_load: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpbroadcastq (%rdi), %xmm0 +; X64-AVX512-NEXT: retq %1 = load <4 x float>, <4 x float> *%a0 %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> ) ret <4 x float> %2 @@ -76,7 +108,7 @@ define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) { ; CHECK-LABEL: combine_vpermilvar_4f32_unpckh: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) ret <4 x float> %1 @@ -85,7 +117,7 @@ define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) { ; CHECK-LABEL: combine_vpermilvar_4f32_unpckl: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) ret <4 x float> %1 @@ -118,12 +150,12 @@ ; ; AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermq {{.*#+}} 
ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) %2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> @@ -132,10 +164,15 @@ } define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) { -; AVX-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: -; AVX: # %bb.0: -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; AVX-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: ; AVX512: # %bb.0: @@ -152,10 +189,20 @@ } define <4 x double> @combine_vperm2f128_vpermilvar_as_vperm2f128(<4 x double> %a0) { -; CHECK-LABEL: combine_vperm2f128_vpermilvar_as_vperm2f128: -; CHECK: # %bb.0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vperm2f128_vpermilvar_as_vperm2f128: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vperm2f128_vpermilvar_as_vperm2f128: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vperm2f128_vpermilvar_as_vperm2f128: +; AVX512: # %bb.0: +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> ) %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> %3 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %2, <4 x i64> ) @@ -165,7 +212,7 @@ define <4 x double> @combine_vperm2f128_vpermilvar_as_vmovaps(<4 x double> %a0) { ; CHECK-LABEL: combine_vperm2f128_vpermilvar_as_vmovaps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> ) %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> @@ -245,10 +292,20 @@ } define <2 x double> @combine_vpermilvar_2f64_movddup(<2 x double> %a0) { -; CHECK-LABEL: combine_vpermilvar_2f64_movddup: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vpermilvar_2f64_movddup: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vpermilvar_2f64_movddup: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vpermilvar_2f64_movddup: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> ) ret <2 x double> %1 } @@ -274,7 +331,7 @@ define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) { ; CHECK-LABEL: combine_vpermilvar_4f32_4stage: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] +; 
CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> ) @@ -284,10 +341,20 @@ } define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) { -; CHECK-LABEL: combine_vpermilvar_8f32_4stage: -; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] -; CHECK-NEXT: ret{{[l|q]}} +; AVX1-LABEL: combine_vpermilvar_8f32_4stage: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vpermilvar_8f32_4stage: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_vpermilvar_8f32_4stage: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> ) %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> ) @@ -308,7 +375,7 @@ define <2 x double> @constant_fold_vpermilvar_pd() { ; CHECK-LABEL: constant_fold_vpermilvar_pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2.0E+0,1.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> , <2 x i64> ) ret <2 x double> %1 @@ -317,7 +384,7 @@ define <4 x double> @constant_fold_vpermilvar_pd_256() { ; CHECK-LABEL: constant_fold_vpermilvar_pd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> , <4 x i64> ) ret <4 x double> %1 @@ -326,7 +393,7 @@ define <4 x float> @constant_fold_vpermilvar_ps() { ; CHECK-LABEL: constant_fold_vpermilvar_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> , <4 x i32> ) ret <4 x float> %1 @@ -335,7 +402,7 @@ define <8 x float> @constant_fold_vpermilvar_ps_256() { ; CHECK-LABEL: constant_fold_vpermilvar_ps_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> , <8 x i32> ) ret <8 x float> %1 @@ -447,13 +514,13 @@ ; AVX2-LABEL: concat_self_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: concat_self_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512-NEXT: ret{{[l|q]}} %cat = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> %s = shufflevector <4 x i64> %cat, <4 x i64> undef, <4 x i32> Index: llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll 
=================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -12,7 +12,7 @@ define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) { ; CHECK-LABEL: combine_pshufb_pslldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> @@ -22,7 +22,7 @@ define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) { ; CHECK-LABEL: combine_pshufb_psrldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> @@ -116,8 +116,8 @@ define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) { ; CHECK-LABEL: combine_as_vpermd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> ) @@ -128,8 +128,8 @@ define <8 x float> @combine_as_vpermps(<8 x float> %a0) { ; CHECK-LABEL: combine_as_vpermps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7> -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7> +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) @@ -140,7 +140,7 @@ define <32 x i8> @combine_permq_pshufb_as_vmovaps(<4 x i64> %a0) { ; CHECK-LABEL: combine_permq_pshufb_as_vmovaps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> %2 = bitcast <4 x i64> %1 to <32 x i8> @@ -151,8 +151,8 @@ define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) { ; CHECK-LABEL: combine_permq_pshufb_as_vpblendd: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> %2 = bitcast <4 x i64> %1 to <32 x i8> @@ -244,7 +244,7 @@ define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) { ; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> ) ret <16 x i8> %1 @@ -271,7 +271,7 @@ define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) { ; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = bitcast <4 x float> %a to <16 x i8> %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> ) @@ 
-282,7 +282,7 @@ define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) { ; CHECK-LABEL: combine_permps_as_vpbroadcastss256: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer) @@ -292,7 +292,7 @@ define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) { ; CHECK-LABEL: combine_permps_as_vpbroadcastsd256: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> %2 = bitcast <4 x double> %1 to <8 x float> @@ -324,7 +324,7 @@ define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) { ; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer %2 = bitcast <4 x float> %1 to <16 x i8> @@ -336,7 +336,7 @@ define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) { ; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer) @@ -346,7 +346,7 @@ define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) { ; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer %2 = bitcast <4 x double> %1 to <8 x float> @@ -358,7 +358,7 @@ define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) { ; CHECK-LABEL: combine_permd_as_permq: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> ) ret <8 x i32> %1 @@ -367,7 +367,7 @@ define <8 x float> @combine_permps_as_permpd(<8 x float> %a) { ; CHECK-LABEL: combine_permps_as_permpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,0,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> ) ret <8 x float> %1 @@ -422,8 +422,8 @@ define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) { ; CHECK-LABEL: combine_pshufb_as_vzmovl_32: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; CHECK-NEXT: ret{{[l|q]}} %1 = bitcast <8 x float> %a0 to <32 x i8> %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> ) @@ -543,12 +543,12 @@ define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) { ; X86-LABEL: combine_psrlw_pshufb: ; X86: # %bb.0: -; X86-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; X86-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 ; X86-NEXT: retl ; ; 
X64-LABEL: combine_psrlw_pshufb: ; X64: # %bb.0: -; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: retq %1 = lshr <16 x i16> %a0, %2 = bitcast <16 x i16> %1 to <32 x i8> @@ -559,12 +559,12 @@ define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) { ; X86-LABEL: combine_pslld_pshufb: ; X86: # %bb.0: -; X86-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; X86-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_pslld_pshufb: ; X64: # %bb.0: -; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: retq %1 = shl <8 x i32> %a0, %2 = bitcast <8 x i32> %1 to <32 x i8> @@ -686,7 +686,7 @@ define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) { ; X86-LABEL: combine_pshufb_insertion_as_broadcast_v2i64: ; X86: # %bb.0: -; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64: @@ -703,7 +703,7 @@ define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) { ; X86-LABEL: combine_permd_insertion_as_broadcast_v4i64: ; X86: # %bb.0: -; X86-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64: @@ -720,7 +720,7 @@ define <32 x i8> @combine_pshufb_pshufb_or_as_blend(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK-LABEL: combine_pshufb_pshufb_or_as_blend: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a1, <32 x i8> ) @@ -742,7 +742,7 @@ define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) { ; CHECK-LABEL: combine_pshufb_pshufb_or_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) @@ -754,7 +754,7 @@ define <8 x i32> @constant_fold_permd() { ; CHECK-LABEL: constant_fold_permd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> , <8 x i32> ) ret <8 x i32> %1 @@ -763,7 +763,7 @@ define <8 x float> @constant_fold_permps() { ; CHECK-LABEL: constant_fold_permps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> , <8 x i32> ) ret <8 x float> %1 @@ -772,7 +772,7 @@ define <32 x i8> @constant_fold_pshufb_256() { ; CHECK-LABEL: constant_fold_pshufb_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250> +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250> ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> , <32 x i8> ) ret <32 x i8> %1 
@@ -782,9 +782,9 @@ ; X86-LABEL: broadcast_v2i64_multiuse: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; X86-NEXT: vextractps $2, %xmm0, %eax +; X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vpbroadcastq %xmm0, %xmm0 +; X86-NEXT: vpextrd $2, %xmm0, %eax ; X86-NEXT: addl (%ecx), %eax ; X86-NEXT: retl ; @@ -819,12 +819,12 @@ define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) { ; AVX2-LABEL: PR34577: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = -; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: ret{{[l|q]}} ; ; AVX512-LABEL: PR34577: @@ -848,15 +848,15 @@ define void @packss_zext_v8i1() { ; X86-LABEL: packss_zext_v8i1: ; X86: # %bb.0: -; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X86-NEXT: vmovups %ymm0, (%eax) +; X86-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmovdqu %ymm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: packss_zext_v8i1: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovups %ymm0, (%rax) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqu %ymm0, (%rax) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %tmp0 = icmp sgt <8 x i32> undef, undef Index: llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -519,7 +519,7 @@ define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) { ; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1) ret <16 x float> %res0 @@ -528,12 +528,12 @@ ; X86-LABEL: combine_vpermt2var_16f32_vpermilps_load: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; X86-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_16f32_vpermilps_load: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; X64-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; X64-NEXT: retq %x0 = load <16 x float>, <16 x float> *%p0 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1) @@ -627,7 +627,7 @@ define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) { ; CHECK-LABEL: combine_permvar_as_vpbroadcastd512: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vbroadcastss %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer) ret <16 x i32> %1 @@ -636,7 +636,7 @@ define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) { ; CHECK-LABEL: combine_permvar_as_vpbroadcastq512: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer) ret <8 x i64> %1 @@ -645,7 +645,7 @@ define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) { ; CHECK-LABEL: combine_permvar_8i64_as_permq: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] +; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> ) ret <8 x i64> %1 @@ -689,7 +689,7 @@ define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) { ; CHECK-LABEL: combine_permvar_8f64_as_permpd: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] +; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> ) ret <8 x double> %1 @@ -788,7 +788,7 @@ define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] +; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> , <16 x float> %a1, i16 -1) ret <16 x float> %res0 @@ -797,7 +797,7 @@ define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq: ; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; CHECK-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> , <16 x i32> %a1, i16 -1) ret <16 x i32> %res0 @@ -806,14 +806,14 @@ define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) { ; X86-LABEL: combine_vpermi2var_8f64_as_vpermpd: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; X64-NEXT: 
retq %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 -1) %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> , <8 x double> %res0, i8 -1) @@ -823,14 +823,14 @@ define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) { ; X86-LABEL: combine_vpermt2var_8i64_as_vpermq: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] +; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermt2var_8i64_as_vpermq: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; X64-NEXT: retq %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %x0, <8 x i64> %x1, i8 -1) %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> , <8 x i64> %res0, <8 x i64> %res0, i8 -1) @@ -840,8 +840,8 @@ define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) { ; CHECK-LABEL: combine_vpermi2var_16f32_as_vpermps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x1, i16 -1) %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> , <16 x float> %res0, i16 -1) @@ -851,8 +851,8 @@ define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: combine_vpermt2var_16i32_as_vpermd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] +; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %x0, <16 x i32> %x1, i16 -1) %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> , <16 x i32> %res0, <16 x i32> %res0, i16 -1) @@ -909,7 +909,7 @@ ; ; X64-LABEL: combine_vpermi2var_8f64_as_permpd: ; X64: # %bb.0: -; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,2,2,5,7,6,6] +; X64-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,2,2,5,7,6,6] ; X64-NEXT: retq %res0 = insertelement <8 x i64> , i64 %a2, i32 0 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %res0, <8 x double> %x1, i8 -1) @@ -962,7 +962,7 @@ define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) { ; X86-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64: ; X86: # %bb.0: -; X86-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %zmm0 ; X86-NEXT: retl ; ; X64-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64: Index: llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ 
llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -17,7 +17,7 @@ ; ; AVX-LABEL: combine_vpshufb_as_zero: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> ) @@ -51,10 +51,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_pshufb_as_movsd: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_as_movsd: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_as_movsd: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_as_movsd: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: retq %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = bitcast <2 x double> %1 to <16 x i8> %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> ) @@ -73,10 +83,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_pshufb_as_movss: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_as_movss: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_as_movss: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_as_movss: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512F-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> %2 = bitcast <4 x float> %1 to <16 x i8> %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> ) @@ -134,11 +154,23 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_pshufb_as_vzmovl_32: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_as_vzmovl_32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_as_vzmovl_32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_as_vzmovl_32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512F-NEXT: retq %1 = bitcast <4 x float> %a0 to <16 x i8> %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> ) %3 = bitcast <16 x i8> %2 to <4 x float> @@ -204,7 +236,7 @@ ; ; AVX-LABEL: combine_pshufb_palignr: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> ) @@ -219,7 +251,7 @@ ; ; AVX-LABEL: combine_pshufb_pslldq: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor 
%xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> @@ -234,7 +266,7 @@ ; ; AVX-LABEL: combine_pshufb_psrldq: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> @@ -497,8 +529,8 @@ ; ; AVX-LABEL: combine_pshufb_as_unpacklo_zero: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) ret <16 x i8> %1 @@ -689,10 +721,20 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_pshufb_pshufb_or_as_blend: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_pshufb_pshufb_or_as_blend: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pshufb_pshufb_or_as_blend: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_pshufb_pshufb_or_as_blend: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a1, <16 x i8> ) %3 = or <16 x i8> %1, %2 @@ -723,17 +765,17 @@ ; ; AVX1-LABEL: combine_pshufb_pshufb_or_pshufb: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_pshufb_pshufb_or_pshufb: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: combine_pshufb_pshufb_or_pshufb: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512F-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) @@ -750,7 +792,7 @@ ; ; AVX-LABEL: constant_fold_pshufb: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> ; AVX-NEXT: retq %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> , <16 x i8> ) ret <16 x i8> %1 Index: llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll @@ -15,7 +15,7 @@ define <2 x double> @combine_vpermil2pd_identity(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: combine_vpermil2pd_identity: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x i64> , i8 0) %res1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %res0, <2 x double> undef, <2 x 
i64> , i8 0) @@ -25,7 +25,7 @@ define <4 x double> @combine_vpermil2pd256_identity(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: combine_vpermil2pd256_identity: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x i64> , i8 0) %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %res0, <4 x double> undef, <4 x i64> , i8 0) @@ -45,7 +45,7 @@ define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: combine_vpermil2ps_identity: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x i32> , i8 0) %res1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %res0, <4 x float> undef, <4 x i32> , i8 0) @@ -76,7 +76,7 @@ define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: combine_vpermil2ps256_identity: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> , i8 0) %res1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %res0, <8 x float> undef, <8 x i32> , i8 0) @@ -96,18 +96,24 @@ define <8 x float> @combine_vpermil2ps256_zero(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: combine_vpermil2ps256_zero: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> , i8 2) ret <8 x float> %res0 } define <4 x float> @combine_vpermil2ps_blend_with_zero(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: combine_vpermil2ps_blend_with_zero: -; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; CHECK-NEXT: ret{{[l|q]}} +; AVX-LABEL: combine_vpermil2ps_blend_with_zero: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_vpermil2ps_blend_with_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: ret{{[l|q]}} %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> , i8 2) ret <4 x float> %res0 } @@ -131,14 +137,23 @@ } define <4 x double> @demandedelts_vpermil2pd256_as_shufpd(<4 x double> %a0, <4 x double> %a1, i64 %a2) { -; X86-LABEL: demandedelts_vpermil2pd256_as_shufpd: -; X86: # %bb.0: -; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X86-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; X86-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm2, %ymm2 -; X86-NEXT: vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0 -; X86-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,3] -; X86-NEXT: retl +; X86-AVX-LABEL: demandedelts_vpermil2pd256_as_shufpd: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; X86-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; X86-AVX-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm2, %ymm2 +; X86-AVX-NEXT: vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0 +; X86-AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,3] +; X86-AVX-NEXT: retl +; +; 
X86-AVX2-LABEL: demandedelts_vpermil2pd256_as_shufpd: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X86-AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; X86-AVX2-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,3] +; X86-AVX2-NEXT: retl ; ; X64-LABEL: demandedelts_vpermil2pd256_as_shufpd: ; X64: # %bb.0: @@ -153,7 +168,7 @@ define <16 x i8> @combine_vpperm_identity(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: combine_vpperm_identity: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> ) %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> ) @@ -163,7 +178,7 @@ define <16 x i8> @combine_vpperm_zero(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: combine_vpperm_zero: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> ) %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> ) @@ -273,12 +288,19 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) { -; X86-LABEL: buildvector_v4f32_0404: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: vmovaps %xmm0, (%eax) -; X86-NEXT: retl +; X86-AVX-LABEL: buildvector_v4f32_0404: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X86-AVX2-LABEL: buildvector_v4f32_0404: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0 +; X86-AVX2-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX2-NEXT: retl ; ; X64-AVX-LABEL: buildvector_v4f32_0404: ; X64-AVX: # %bb.0: @@ -327,7 +349,7 @@ define <2 x double> @constant_fold_vpermil2pd() { ; CHECK-LABEL: constant_fold_vpermil2pd: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [-2.0E+0,2.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> , <2 x double> , <2 x i64> , i8 2) ret <2 x double> %1 @@ -336,7 +358,7 @@ define <4 x double> @constant_fold_vpermil2pd_256() { ; CHECK-LABEL: constant_fold_vpermil2pd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> , <4 x double> , <4 x i64> , i8 2) ret <4 x double> %1 @@ -345,7 +367,7 @@ define <4 x float> @constant_fold_vpermil2ps() { ; CHECK-LABEL: constant_fold_vpermil2ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> , <4 x float> , <4 x i32> , i8 2) ret <4 x float> %1 @@ -354,7 +376,7 @@ define <8 x float> @constant_fold_vpermil2ps_256() { ; CHECK-LABEL: constant_fold_vpermil2ps_256: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> , <8 x float> , <8 x i32> , i8 2) ret <8 x float> %1 @@ -363,7 +385,7 @@ define <16 x i8> @constant_fold_vpperm() { ; CHECK-LABEL: constant_fold_vpperm: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> , <16 x i8> , <16 x i8> ) ret <16 x i8> %1 Index: llvm/test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -99,12 +99,12 @@ ; ; AVX1-LABEL: combine_pshufd6: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_pshufd6: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: retq entry: %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0) @@ -176,8 +176,8 @@ ; ; AVX-LABEL: combine_bitwise_ops_test1: ; AVX: # %bb.0: -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> @@ -194,8 +194,8 @@ ; ; AVX-LABEL: combine_bitwise_ops_test2: ; AVX: # %bb.0: -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> @@ -212,8 +212,8 @@ ; ; AVX-LABEL: combine_bitwise_ops_test3: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> @@ -230,8 +230,8 @@ ; ; AVX-LABEL: combine_bitwise_ops_test4: ; AVX: # %bb.0: -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> @@ -248,8 +248,8 @@ ; ; AVX-LABEL: combine_bitwise_ops_test5: ; AVX: # %bb.0: -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> @@ -266,8 +266,8 @@ ; ; AVX-LABEL: combine_bitwise_ops_test6: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = 
xmm0[0,2,1,3] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> @@ -302,11 +302,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test1b: -; AVX: # %bb.0: -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test1b: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test1b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> %and = and <4 x i32> %shuf1, %shuf2 @@ -336,11 +342,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test2b: -; AVX: # %bb.0: -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test2b: +; AVX1: # %bb.0: +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test2b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> %or = or <4 x i32> %shuf1, %shuf2 @@ -367,12 +379,19 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test3b: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test3b: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test3b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> %xor = xor <4 x i32> %shuf1, %shuf2 @@ -402,11 +421,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test4b: -; AVX: # %bb.0: -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test4b: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test4b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> %and = and <4 x i32> %shuf1, %shuf2 @@ -436,11 +461,17 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test5b: -; AVX: # %bb.0: -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test5b: +; AVX1: # %bb.0: +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test5b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> %or = or <4 x i32> %shuf1, %shuf2 @@ -467,12 +498,19 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_bitwise_ops_test6b: -; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_bitwise_ops_test6b: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test6b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> %xor = xor <4 x i32> %shuf1, %shuf2 @@ -627,7 +665,7 @@ ; ; AVX-LABEL: combine_nested_undef_test1: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -642,7 +680,7 @@ ; ; AVX-LABEL: combine_nested_undef_test2: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -657,7 +695,7 @@ ; ; AVX-LABEL: combine_nested_undef_test3: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -672,12 +710,12 @@ ; ; AVX1-LABEL: combine_nested_undef_test4: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -692,7 +730,7 @@ ; ; AVX-LABEL: combine_nested_undef_test5: ; AVX: # %bb.0: -; 
AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -707,7 +745,7 @@ ; ; AVX-LABEL: combine_nested_undef_test6: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -722,7 +760,7 @@ ; ; AVX-LABEL: combine_nested_undef_test7: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -737,7 +775,7 @@ ; ; AVX-LABEL: combine_nested_undef_test8: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -752,7 +790,7 @@ ; ; AVX-LABEL: combine_nested_undef_test9: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -767,7 +805,7 @@ ; ; AVX-LABEL: combine_nested_undef_test10: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -782,7 +820,7 @@ ; ; AVX-LABEL: combine_nested_undef_test11: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -797,12 +835,12 @@ ; ; AVX1-LABEL: combine_nested_undef_test12: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test12: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -828,7 +866,7 @@ ; ; AVX-LABEL: combine_nested_undef_test14: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -866,16 +904,16 @@ ; ; AVX1-LABEL: combine_nested_undef_test15: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test15: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = 
xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -903,11 +941,17 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test16: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -932,11 +976,17 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test17: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test17: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test17: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -950,7 +1000,7 @@ ; ; AVX-LABEL: combine_nested_undef_test18: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -976,11 +1026,17 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test19: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test19: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test19: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -1007,11 +1063,17 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test20: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test20: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test20: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -1038,14 +1100,14 @@ ; ; AVX1-LABEL: combine_nested_undef_test21: ; AVX1: # %bb.0: -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test21: ; AVX2: # %bb.0: -; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1064,7 +1126,7 @@ ; ; AVX-LABEL: combine_nested_undef_test22: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1079,7 +1141,7 @@ ; ; AVX-LABEL: combine_nested_undef_test23: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1094,7 +1156,7 @@ ; ; AVX-LABEL: combine_nested_undef_test24: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1109,12 +1171,12 @@ ; ; AVX1-LABEL: combine_nested_undef_test25: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test25: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1129,7 +1191,7 @@ ; ; AVX-LABEL: combine_nested_undef_test26: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1144,12 +1206,12 @@ ; ; AVX1-LABEL: combine_nested_undef_test27: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test27: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1164,7 +1226,7 @@ ; ; AVX-LABEL: combine_nested_undef_test28: ; AVX: # %bb.0: -; 
AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1179,7 +1241,7 @@ ; ; AVX-LABEL: combine_test1: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1204,10 +1266,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test2: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1221,7 +1288,7 @@ ; ; AVX-LABEL: combine_test3: ; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1236,7 +1303,7 @@ ; ; AVX-LABEL: combine_test4: ; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1261,10 +1328,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test5: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test5: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test5: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1278,7 +1350,7 @@ ; ; AVX-LABEL: combine_test6: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> @@ -1303,10 +1375,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test7: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test7: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test7: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> ret <4 x i32> %2 @@ -1320,7 +1397,7 @@ ; ; AVX-LABEL: combine_test8: ; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> 
%b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> @@ -1336,7 +1413,7 @@ ; ; AVX-LABEL: combine_test9: ; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> @@ -1361,10 +1438,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test10: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test10: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test10: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> ret <4 x i32> %2 @@ -1397,10 +1479,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test12: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test12: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test12: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> ret <4 x float> %2 @@ -1414,7 +1501,7 @@ ; ; AVX-LABEL: combine_test13: ; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> @@ -1429,7 +1516,7 @@ ; ; AVX-LABEL: combine_test14: ; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> @@ -1454,10 +1541,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test15: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test15: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test15: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> ret <4 x float> %2 @@ -1490,10 +1582,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test17: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test17: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test17: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x 
i32> ret <4 x i32> %2 @@ -1507,7 +1604,7 @@ ; ; AVX-LABEL: combine_test18: ; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> @@ -1522,7 +1619,7 @@ ; ; AVX-LABEL: combine_test19: ; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> @@ -1547,10 +1644,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test20: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test20: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test20: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> ret <4 x i32> %2 @@ -1565,14 +1667,23 @@ ; SSE-NEXT: movaps %xmm2, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: combine_test21: -; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX-NEXT: vmovaps %xmm2, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: combine_test21: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: vmovdqa %xmm2, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test21: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: vmovdqa %xmm2, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> store <4 x i32> %1, <4 x i32>* %ptr, align 16 @@ -1607,7 +1718,7 @@ ; ; AVX-LABEL: combine_test23: ; AVX: # %bb.0: -; AVX-NEXT: vmovups %xmm0, (%rdi) +; AVX-NEXT: vmovdqu %xmm0, (%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1 @@ -1630,7 +1741,7 @@ ; ; AVX-LABEL: combine_test1b: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,2,0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1654,10 +1765,15 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test2b: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test2b: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test2b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1682,11 +1798,17 @@ ; 
SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test3b: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_test3b: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test3b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -1701,7 +1823,7 @@ ; ; AVX-LABEL: combine_test4b: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1885,10 +2007,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_blend_01: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_blend_01: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_blend_01: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle6 @@ -1914,10 +2041,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_blend_02: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_blend_02: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_blend_02: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle6 @@ -1941,10 +2073,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_blend_123: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_blend_123: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_blend_123: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> @@ -1960,7 +2097,7 @@ ; ; AVX-LABEL: combine_test_movhl_1: ; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> @@ -1976,7 +2113,7 @@ ; ; AVX-LABEL: combine_test_movhl_2: ; AVX: # %bb.0: 
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> @@ -1992,7 +2129,7 @@ ; ; AVX-LABEL: combine_test_movhl_3: ; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> @@ -2019,10 +2156,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_undef_input_test1: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_input_test1: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_undef_input_test1: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -2036,7 +2178,7 @@ ; ; AVX-LABEL: combine_undef_input_test2: ; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -2051,7 +2193,7 @@ ; ; AVX-LABEL: combine_undef_input_test3: ; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -2066,7 +2208,7 @@ ; ; AVX-LABEL: combine_undef_input_test4: ; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -2089,10 +2231,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_undef_input_test5: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_input_test5: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_undef_input_test5: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> ret <4 x float> %2 @@ -2127,10 +2274,15 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_undef_input_test7: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_input_test7: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_undef_input_test7: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> ret <4 x float> %2 @@ -2152,10 +2304,15 @@ ; SSE41-NEXT: movddup 
{{.*#+}} xmm0 = xmm0[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_undef_input_test8: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_input_test8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_undef_input_test8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> ret <4 x float> %2 @@ -2201,10 +2358,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_undef_input_test11: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_input_test11: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_undef_input_test11: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> ret <4 x float> %2 @@ -2218,7 +2380,7 @@ ; ; AVX-LABEL: combine_undef_input_test12: ; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> @@ -2233,7 +2395,7 @@ ; ; AVX-LABEL: combine_undef_input_test13: ; AVX: # %bb.0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> @@ -2248,7 +2410,7 @@ ; ; AVX-LABEL: combine_undef_input_test14: ; AVX: # %bb.0: -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> @@ -2271,10 +2433,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_undef_input_test15: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_input_test15: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_undef_input_test15: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> ret <4 x float> %2 @@ -2315,10 +2482,15 @@ ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_undef_input_test17: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_input_test17: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_undef_input_test17: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> ret <4 x float> %2 @@ -2340,10 +2512,15 @@ ; 
SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_undef_input_test18: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-NEXT: retq +; AVX1-LABEL: combine_undef_input_test18: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_undef_input_test18: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> ret <4 x float> %2 @@ -2567,8 +2744,8 @@ ; ; AVX-LABEL: combine_scalar_load_with_blend_with_zero: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovaps %xmm0, (%rsi) +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovdqa %xmm0, (%rsi) ; AVX-NEXT: retq %1 = load double, double* %a0, align 8 %2 = insertelement <2 x double> undef, double %1, i32 0 @@ -2600,10 +2777,15 @@ ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_constant_insertion_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: combine_constant_insertion_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_constant_insertion_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; AVX2-NEXT: retq %a0 = insertelement <4 x float> undef, float %f, i32 0 %ret = shufflevector <4 x float> %a0, <4 x float> , <4 x i32> ret <4 x float> %ret @@ -3217,8 +3399,8 @@ ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rdi) -; AVX1-NEXT: vmovups %ymm1, 32(%rdi) +; AVX1-NEXT: vmovdqu %ymm0, (%rdi) +; AVX1-NEXT: vmovdqu %ymm1, 32(%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll +++ llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll @@ -248,7 +248,7 @@ ; ; BTVER2-LABEL: insertqi_len0_idx0: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps %xmm1, %xmm0 +; BTVER2-NEXT: vmovdqa %xmm1, %xmm0 ; BTVER2-NEXT: retq %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a, <2 x i64> %b, i8 0, i8 0) ret <2 x i64> %1 Index: llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -23,7 +23,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: andl $1, %esi ; AVX-NEXT: andl $1, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; AVX-NEXT: retq @@ -52,11 +52,11 @@ ; AVX-NEXT: # kill: def $esi killed $esi def $rsi ; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: andl $1, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $1, %esi -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = 
mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: retq %x0 = extractelement <2 x i64> %x, i32 %i0 %x1 = extractelement <2 x i64> %x, i32 %i1 @@ -132,7 +132,7 @@ ; AVX-NEXT: andl $3, %edi ; AVX-NEXT: andl $3, %esi ; AVX-NEXT: andl $3, %edx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] @@ -217,7 +217,7 @@ ; AVX-NEXT: andl $3, %edi ; AVX-NEXT: andl $3, %esi ; AVX-NEXT: andl $3, %edx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0 @@ -371,7 +371,7 @@ ; AVX-NEXT: andl $7, %edx ; AVX-NEXT: andl $7, %ecx ; AVX-NEXT: andl $7, %r8d -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $7, %r9d ; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %edi ; AVX-NEXT: vmovd %edi, %xmm0 @@ -634,7 +634,7 @@ ; AVX-NEXT: # kill: def $esi killed $esi def $rsi ; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: andl $15, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movzbl -24(%rsp,%rdi), %eax ; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: andl $15, %esi @@ -785,7 +785,7 @@ ; AVX-NEXT: andl $3, %edx ; AVX-NEXT: movl 12(%rdi), %esi ; AVX-NEXT: andl $3, %esi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0 ; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0 @@ -1101,7 +1101,7 @@ ; AVX-NEXT: movzbl 13(%rdi), %ebp ; AVX-NEXT: movzbl 14(%rdi), %eax ; AVX-NEXT: movzbl 15(%rdi), %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movzbl -24(%rsp,%r9), %r9d ; AVX-NEXT: vmovd %r9d, %xmm0 ; AVX-NEXT: andl $15, %ebx @@ -1270,9 +1270,9 @@ ; AVX-NEXT: # kill: def $edx killed $edx def $rdx ; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: andl $3, %edi -; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $3, %edx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -1399,13 +1399,13 @@ ; AVX-NEXT: # kill: def $esi killed $esi def $rsi ; AVX-NEXT: # kill: def $edi killed $edi def $rdi ; AVX-NEXT: andl $7, %edi -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: movzwl -40(%rsp,%rdi,2), %eax ; AVX-NEXT: andl $7, %esi ; AVX-NEXT: andl $7, %edx ; AVX-NEXT: andl $7, %ecx ; AVX-NEXT: andl $7, %r8d -; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $7, %r9d ; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 Index: llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -17,7 +17,7 @@ ; ALL-NEXT: andl $3, %edi 
; ALL-NEXT: andl $3, %ecx ; ALL-NEXT: andl $3, %edx -; ALL-NEXT: vmovaps %ymm0, (%rsp) +; ALL-NEXT: vmovdqa %ymm0, (%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero @@ -38,21 +38,37 @@ } define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { -; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp -; ALL-NEXT: movq %rsp, %rbp -; ALL-NEXT: andq $-32, %rsp -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: andl $3, %edx -; ALL-NEXT: andl $3, %esi -; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: movq %rbp, %rsp -; ALL-NEXT: popq %rbp -; ALL-NEXT: retq +; AVX1-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: vmovdqa %ymm0, (%rsp) +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: vpbroadcastq (%rsp,%rsi,8), %xmm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq %x0 = extractelement <4 x double> %x, i64 %i0 %x1 = extractelement <4 x double> %x, i64 %i1 %x2 = extractelement <4 x double> %x, i64 %i2 @@ -71,7 +87,7 @@ ; ALL-NEXT: andl $1, %edi ; ALL-NEXT: andl $1, %ecx ; ALL-NEXT: andl $1, %edx -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero @@ -90,27 +106,49 @@ } define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { -; ALL-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp -; ALL-NEXT: movq %rsp, %rbp -; ALL-NEXT: andq $-32, %rsp -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: andl $3, %edi -; ALL-NEXT: andl $3, %esi -; ALL-NEXT: andl $3, %edx -; ALL-NEXT: andl $3, %ecx -; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: movq %rbp, %rsp -; ALL-NEXT: popq %rbp -; ALL-NEXT: retq +; AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: andl $3, %edi +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: vmovdqa %ymm0, (%rsp) +; 
AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq %x0 = extractelement <4 x i64> %x, i64 %i0 %x1 = extractelement <4 x i64> %x, i64 %i1 %x2 = extractelement <4 x i64> %x, i64 %i2 @@ -131,10 +169,10 @@ ; ALL-NEXT: subq $64, %rsp ; ALL-NEXT: andl $3, %edi ; ALL-NEXT: andl $3, %esi -; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: vmovdqa %ymm0, (%rsp) +; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; ALL-NEXT: movq %rbp, %rsp ; ALL-NEXT: popq %rbp ; ALL-NEXT: retq @@ -150,21 +188,37 @@ } define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind { -; ALL-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64: -; ALL: # %bb.0: -; ALL-NEXT: andl $1, %edi -; ALL-NEXT: andl $1, %esi -; ALL-NEXT: andl $1, %edx -; ALL-NEXT: andl $1, %ecx -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq %x0 = extractelement <2 x i64> %x, i64 %i0 %x1 = extractelement <2 x i64> %x, i64 %i1 %x2 = extractelement <2 x i64> %x, i64 %i2 @@ -198,7 +252,7 @@ ; ALL-NEXT: andl $7, %edx ; ALL-NEXT: andl $7, %ecx ; ALL-NEXT: andl $7, %r8d -; ALL-NEXT: vmovaps %ymm0, (%rsp) +; ALL-NEXT: vmovdqa %ymm0, (%rsp) ; ALL-NEXT: andl $7, %r9d ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] @@ -249,7 +303,7 @@ ; ALL-NEXT: andl $3, %edx ; ALL-NEXT: andl $3, %ecx ; ALL-NEXT: andl $3, %r8d -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; ALL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: andl $3, %r9d ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] @@ -294,7 +348,7 @@ ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1-NEXT: andl $15, %edi -; AVX1-NEXT: vmovaps %ymm0, (%rsp) +; AVX1-NEXT: vmovdqa %ymm0, (%rsp) ; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: andl $15, %esi @@ -356,7 +410,7 @@ ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX2-NEXT: andl $15, %edi -; AVX2-NEXT: vmovaps %ymm0, (%rsp) +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) ; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: andl $15, %esi @@ -449,7 +503,7 @@ ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi ; AVX1-NEXT: andl $7, %edi -; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: andl $7, %esi @@ -505,7 +559,7 @@ ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi ; AVX2-NEXT: andl $7, %edi -; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: andl $7, %esi @@ -591,31 +645,57 @@ ; define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind { -; ALL-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp -; ALL-NEXT: movq %rsp, %rbp -; ALL-NEXT: andq $-32, %rsp -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: andl $3, %eax -; ALL-NEXT: andl $3, %ecx -; ALL-NEXT: movq 16(%rdi), %rdx -; ALL-NEXT: andl $3, %edx -; ALL-NEXT: movq 24(%rdi), %rsi -; ALL-NEXT: andl $3, %esi -; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: movq %rbp, %rsp -; ALL-NEXT: popq %rbp -; ALL-NEXT: retq +; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: movq 
8(%rdi), %rcx +; AVX1-NEXT: andl $3, %eax +; AVX1-NEXT: andl $3, %ecx +; AVX1-NEXT: movq 16(%rdi), %rdx +; AVX1-NEXT: andl $3, %edx +; AVX1-NEXT: movq 24(%rdi), %rsi +; AVX1-NEXT: andl $3, %esi +; AVX1-NEXT: vmovdqa %ymm0, (%rsp) +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: andl $3, %eax +; AVX2-NEXT: andl $3, %ecx +; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: movq 24(%rdi), %rsi +; AVX2-NEXT: andl $3, %esi +; AVX2-NEXT: vmovdqa %ymm0, (%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq %p0 = getelementptr inbounds i64, i64* %i, i32 0 %p1 = getelementptr inbounds i64, i64* %i, i32 1 %p2 = getelementptr inbounds i64, i64* %i, i32 2 @@ -636,25 +716,45 @@ } define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind { -; ALL-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64: -; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: andl $1, %eax -; ALL-NEXT: andl $1, %ecx -; ALL-NEXT: movq 16(%rdi), %rdx -; ALL-NEXT: andl $1, %edx -; ALL-NEXT: movq 24(%rdi), %rsi -; ALL-NEXT: andl $1, %esi -; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: movq 8(%rdi), %rcx +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: movq 16(%rdi), %rdx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: movq 24(%rdi), %rsi +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: movq 8(%rdi), %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: movq 16(%rdi), %rdx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: movq 24(%rdi), %rsi +; 
AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq %p0 = getelementptr inbounds i64, i64* %i, i32 0 %p1 = getelementptr inbounds i64, i64* %i, i32 1 %p2 = getelementptr inbounds i64, i64* %i, i32 2 Index: llvm/test/CodeGen/X86/vector-trunc-math.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc-math.ll +++ llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -2792,9 +2792,9 @@ ; ; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -3190,9 +3190,9 @@ ; ; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3555,9 +3555,9 @@ ; ; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -3953,9 +3953,9 @@ ; ; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4318,9 +4318,9 @@ ; ; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -4716,9 +4716,9 @@ ; ; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-trunc.ll =================================================================== --- llvm/test/CodeGen/X86/vector-trunc.ll +++ llvm/test/CodeGen/X86/vector-trunc.ll @@ -34,10 +34,10 @@ ; ; AVX2-FAST-LABEL: trunc8i64_8i32: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32: @@ -73,10 +73,10 @@ ; ; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32_ashr: @@ -116,10 +116,10 @@ ; ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [1,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32_lshr: @@ -1315,10 +1315,10 @@ ; ; AVX2-FAST-LABEL: trunc2x4i64_8i32: ; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc2x4i64_8i32: @@ -1810,12 +1810,12 @@ ; ; AVX-LABEL: trunc16i64_16i8_const: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: trunc16i64_16i8_const: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq entry: Index: llvm/test/CodeGen/X86/vector-tzcnt-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -1581,27 +1581,27 @@ ; ; AVX-LABEL: foldv2i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vmovdqa 
{{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64: @@ -1620,27 +1620,27 @@ ; ; AVX-LABEL: foldv2i64u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64u: @@ -1659,27 +1659,27 @@ ; ; AVX-LABEL: foldv4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv4i32: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv4i32: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32: @@ -1698,27 +1698,27 @@ ; ; AVX-LABEL: foldv4i32u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv4i32u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv4i32u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i32u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: 
foldv4i32u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,32,0] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32u: @@ -1737,27 +1737,27 @@ ; ; AVX-LABEL: foldv8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv8i16: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv8i16: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv8i16: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv8i16: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16: @@ -1776,27 +1776,27 @@ ; ; AVX-LABEL: foldv8i16u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv8i16u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv8i16u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv8i16u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv8i16u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16u: @@ -1815,27 +1815,27 @@ ; ; AVX-LABEL: foldv16i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv16i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv16i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv16i8: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv16i8: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG-NEXT: 
vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8: @@ -1854,27 +1854,27 @@ ; ; AVX-LABEL: foldv16i8u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv16i8u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv16i8u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv16i8u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv16i8u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; BITALG-NEXT: vmovdqa {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8u: Index: llvm/test/CodeGen/X86/vector-tzcnt-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -1114,22 +1114,22 @@ define <4 x i64> @foldv4i64() nounwind { ; AVX-LABEL: foldv4i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> , i1 0) ret <4 x i64> %out @@ -1138,22 +1138,22 @@ define <4 x i64> @foldv4i64u() nounwind { ; AVX-LABEL: foldv4i64u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] ; AVX-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv4i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv4i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; BITALG-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,64,0] ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0] ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> , i1 -1) ret <4 x i64> %out @@ -1162,7 +1162,7 @@ define <8 x i32> @foldv8i32() nounwind { ; ALL-LABEL: foldv8i32: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; ALL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] ; ALL-NEXT: ret{{[l|q]}} %out = call <8 x i32> 
@llvm.cttz.v8i32(<8 x i32> , i1 0) ret <8 x i32> %out @@ -1171,7 +1171,7 @@ define <8 x i32> @foldv8i32u() nounwind { ; ALL-LABEL: foldv8i32u: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; ALL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] ; ALL-NEXT: ret{{[l|q]}} %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 -1) ret <8 x i32> %out @@ -1180,7 +1180,7 @@ define <16 x i16> @foldv16i16() nounwind { ; ALL-LABEL: foldv16i16: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; ALL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] ; ALL-NEXT: ret{{[l|q]}} %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> , i1 0) ret <16 x i16> %out @@ -1189,7 +1189,7 @@ define <16 x i16> @foldv16i16u() nounwind { ; ALL-LABEL: foldv16i16u: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; ALL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] ; ALL-NEXT: ret{{[l|q]}} %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> , i1 -1) ret <16 x i16> %out @@ -1198,7 +1198,7 @@ define <32 x i8> @foldv32i8() nounwind { ; ALL-LABEL: foldv32i8: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; ALL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] ; ALL-NEXT: ret{{[l|q]}} %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> , i1 0) ret <32 x i8> %out @@ -1207,7 +1207,7 @@ define <32 x i8> @foldv32i8u() nounwind { ; ALL-LABEL: foldv32i8u: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; ALL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] ; ALL-NEXT: ret{{[l|q]}} %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> , i1 -1) ret <32 x i8> %out Index: llvm/test/CodeGen/X86/vector-width-store-merge.ll =================================================================== --- llvm/test/CodeGen/X86/vector-width-store-merge.ll +++ llvm/test/CodeGen/X86/vector-width-store-merge.ll @@ -8,10 +8,10 @@ define weak_odr dso_local void @A(i8* %src, i8* %dst) local_unnamed_addr #0 { ; CHECK-LABEL: A: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovups (%rdi), %xmm0 -; CHECK-NEXT: vmovups 16(%rdi), %xmm1 -; CHECK-NEXT: vmovups %xmm1, 16(%rsi) -; CHECK-NEXT: vmovups %xmm0, (%rsi) +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 +; CHECK-NEXT: vmovdqu 16(%rdi), %xmm1 +; CHECK-NEXT: vmovdqu %xmm1, 16(%rsi) +; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ; CHECK-NEXT: retq entry: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false) @@ -22,14 +22,14 @@ define weak_odr dso_local void @B(i8* %src, i8* %dst) local_unnamed_addr #0 { ; CHECK-LABEL: B: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovups (%rdi), %xmm0 -; CHECK-NEXT: vmovups 16(%rdi), %xmm1 -; CHECK-NEXT: vmovups 32(%rdi), %xmm2 -; CHECK-NEXT: vmovups 48(%rdi), %xmm3 -; CHECK-NEXT: vmovups %xmm3, 48(%rsi) -; CHECK-NEXT: vmovups %xmm2, 32(%rsi) -; CHECK-NEXT: vmovups %xmm1, 16(%rsi) -; CHECK-NEXT: vmovups %xmm0, (%rsi) +; CHECK-NEXT: vmovdqu (%rdi), %xmm0 +; CHECK-NEXT: vmovdqu 16(%rdi), %xmm1 +; CHECK-NEXT: vmovdqu 32(%rdi), %xmm2 +; CHECK-NEXT: vmovdqu 48(%rdi), %xmm3 +; CHECK-NEXT: vmovdqu %xmm3, 48(%rsi) +; CHECK-NEXT: vmovdqu %xmm2, 32(%rsi) +; CHECK-NEXT: vmovdqu %xmm1, 16(%rsi) +; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ; CHECK-NEXT: retq entry: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, 
i8* align 1 %src, i64 64, i1 false) @@ -40,8 +40,8 @@ define weak_odr dso_local void @C(i8* %src, i8* %dst) local_unnamed_addr #2 { ; CHECK-LABEL: C: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovups (%rdi), %ymm0 -; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 +; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: @@ -53,10 +53,10 @@ define weak_odr dso_local void @D(i8* %src, i8* %dst) local_unnamed_addr #2 { ; CHECK-LABEL: D: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovups (%rdi), %ymm0 -; CHECK-NEXT: vmovups 32(%rdi), %ymm1 -; CHECK-NEXT: vmovups %ymm1, 32(%rsi) -; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 +; CHECK-NEXT: vmovdqu 32(%rdi), %ymm1 +; CHECK-NEXT: vmovdqu %ymm1, 32(%rsi) +; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: Index: llvm/test/CodeGen/X86/vector-zext.ll =================================================================== --- llvm/test/CodeGen/X86/vector-zext.ll +++ llvm/test/CodeGen/X86/vector-zext.ll @@ -130,7 +130,7 @@ ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vmovdqa %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_32i8_to_32i16: @@ -294,7 +294,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vmovdqa %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i8_to_16i32: @@ -445,7 +445,7 @@ ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vmovdqa %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i8_to_8i64: @@ -587,7 +587,7 @@ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vmovdqa %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i16_to_16i32: @@ -743,7 +743,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vmovdqa %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i16_to_8i64: @@ -884,7 +884,7 @@ ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vmovdqa %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i32_to_8i64: @@ -2080,8 +2080,8 @@ ; ; AVX-LABEL: shuf_zext_4i32_to_2i64_offset2: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = 
xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: retq entry: %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <4 x i32> @@ -2252,7 +2252,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-NEXT: vmovaps %ymm4, %ymm0 +; AVX1-NEXT: vmovdqa %ymm4, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_32i8_to_32i32: Index: llvm/test/CodeGen/X86/vector-zmov.ll =================================================================== --- llvm/test/CodeGen/X86/vector-zmov.ll +++ llvm/test/CodeGen/X86/vector-zmov.ll @@ -13,7 +13,7 @@ ; ; AVX-LABEL: load_zmov_4i32_to_0zzz: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: retq entry: %X = load <4 x i32>, <4 x i32>* %ptr @@ -29,7 +29,7 @@ ; ; AVX-LABEL: load_zmov_2i64_to_0z: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: retq entry: %X = load <2 x i64>, <2 x i64>* %ptr @@ -61,9 +61,9 @@ ; ; AVX-LABEL: load_zmov_4i32_to_0zzz_volatile: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: retq entry: %X = load volatile <4 x i32>, <4 x i32>* %ptr Index: llvm/test/CodeGen/X86/vzero-excess.ll =================================================================== --- llvm/test/CodeGen/X86/vzero-excess.ll +++ llvm/test/CodeGen/X86/vzero-excess.ll @@ -9,7 +9,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $48, %rsp -; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq the_unknown @@ -35,10 +35,10 @@ ; CHECK-LABEL: zeroupper_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq the_unknown -; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; CHECK-NEXT: addq $56, %rsp ; CHECK-NEXT: retq call void @llvm.x86.avx.vzeroupper() @@ -51,7 +51,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $48, %rsp -; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: vzeroall ; CHECK-NEXT: callq the_unknown @@ -77,10 +77,10 @@ ; CHECK-LABEL: zeroall_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; CHECK-NEXT: vzeroall ; CHECK-NEXT: callq the_unknown -; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; CHECK-NEXT: addq $56, %rsp ; CHECK-NEXT: retq call void @llvm.x86.avx.vzeroall() Index: llvm/test/CodeGen/X86/x86-interleaved-access.ll =================================================================== --- 
llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -147,35 +147,35 @@ ; ; AVX2-LABEL: store_factorf64_4: ; AVX2: # %bb.0: -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vmovups %ymm0, 96(%rdi) -; AVX2-NEXT: vmovups %ymm3, 64(%rdi) -; AVX2-NEXT: vmovups %ymm4, 32(%rdi) -; AVX2-NEXT: vmovups %ymm2, (%rdi) +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqu %ymm3, 64(%rdi) +; AVX2-NEXT: vmovdqu %ymm4, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm2, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: store_factorf64_4: ; AVX512: # %bb.0: -; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vmovups %zmm0, 64(%rdi) -; AVX512-NEXT: vmovups %zmm1, (%rdi) +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> @@ -205,35 +205,35 @@ ; ; AVX2-LABEL: store_factori64_4: ; AVX2: # %bb.0: -; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = 
ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vmovups %ymm0, 96(%rdi) -; AVX2-NEXT: vmovups %ymm3, 64(%rdi) -; AVX2-NEXT: vmovups %ymm4, 32(%rdi) -; AVX2-NEXT: vmovups %ymm2, (%rdi) +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqu %ymm3, 64(%rdi) +; AVX2-NEXT: vmovdqu %ymm4, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm2, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: store_factori64_4: ; AVX512: # %bb.0: -; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vmovups %zmm0, 64(%rdi) -; AVX512-NEXT: vmovups %zmm1, (%rdi) +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rdi) +; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> @@ -271,10 +271,10 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, 96(%rdi) -; AVX1-NEXT: vmovaps %ymm3, 64(%rdi) -; AVX1-NEXT: vmovaps %ymm1, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm2, (%rdi) +; AVX1-NEXT: vmovdqa %ymm0, 96(%rdi) +; AVX1-NEXT: vmovdqa %ymm3, 64(%rdi) +; AVX1-NEXT: vmovdqa %ymm1, 32(%rdi) +; AVX1-NEXT: vmovdqa %ymm2, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -709,7 +709,7 @@ ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm15, %xmm1 -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 ; AVX1-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0 @@ -1560,17 +1560,17 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm14, %ymm3 -; AVX1-NEXT: vmovaps %ymm3, 224(%rdi) -; AVX1-NEXT: vmovaps %ymm0, 192(%rdi) -; AVX1-NEXT: vmovaps %ymm7, 160(%rdi) -; AVX1-NEXT: vmovaps %ymm6, 128(%rdi) -; AVX1-NEXT: vmovaps %ymm1, 96(%rdi) -; AVX1-NEXT: vmovaps %ymm2, 64(%rdi) -; AVX1-NEXT: vmovaps %ymm4, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm8, (%rdi) +; AVX1-NEXT: vmovdqa %ymm3, 224(%rdi) +; AVX1-NEXT: vmovdqa %ymm0, 192(%rdi) +; AVX1-NEXT: vmovdqa %ymm7, 160(%rdi) +; AVX1-NEXT: vmovdqa %ymm6, 128(%rdi) +; AVX1-NEXT: vmovdqa %ymm1, 96(%rdi) +; AVX1-NEXT: vmovdqa %ymm2, 64(%rdi) +; AVX1-NEXT: vmovdqa %ymm4, 32(%rdi) +; AVX1-NEXT: vmovdqa %ymm8, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1665,20 +1665,20 @@ ; ; AVX2-LABEL: splat2_v4f64_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-NEXT: vmovups %ymm0, 32(%rsi) -; AVX2-NEXT: vmovups %ymm1, (%rsi) +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: splat2_v4f64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x double>, <4 x double>* %s, align 8 @@ -1703,20 +1703,20 @@ ; ; AVX2-LABEL: splat2_v4i64_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-NEXT: vmovups %ymm0, 32(%rsi) -; AVX2-NEXT: vmovups %ymm1, (%rsi) +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: splat2_v4i64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x i64>, <4 x i64>* %s, align 8 @@ -1749,20 +1749,20 @@ ; ; AVX2-LABEL: splat4_v8f32_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups (%rdi), %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-NEXT: 
vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-NEXT: vmovups %ymm0, 96(%rsi) -; AVX2-NEXT: vmovups %ymm3, 64(%rsi) -; AVX2-NEXT: vmovups %ymm1, 32(%rsi) -; AVX2-NEXT: vmovups %ymm2, (%rsi) +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu (%rdi), %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-NEXT: vmovdqu %ymm0, 96(%rsi) +; AVX2-NEXT: vmovdqu %ymm3, 64(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm2, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1800,29 +1800,29 @@ ; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm3 ; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vmovups %ymm3, 96(%rsi) -; AVX1-NEXT: vmovups %ymm2, 64(%rsi) -; AVX1-NEXT: vmovups %ymm1, 32(%rsi) -; AVX1-NEXT: vmovups %ymm0, (%rsi) +; AVX1-NEXT: vmovdqu %ymm3, 96(%rsi) +; AVX1-NEXT: vmovdqu %ymm2, 64(%rsi) +; AVX1-NEXT: vmovdqu %ymm1, 32(%rsi) +; AVX1-NEXT: vmovdqu %ymm0, (%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: splat4_v8i32_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups (%rdi), %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-NEXT: vmovups %ymm0, 96(%rsi) -; AVX2-NEXT: vmovups %ymm3, 64(%rsi) -; AVX2-NEXT: vmovups %ymm1, 32(%rsi) -; AVX2-NEXT: vmovups %ymm2, (%rsi) +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu (%rdi), %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-NEXT: vmovdqu %ymm0, 96(%rsi) +; AVX2-NEXT: vmovdqu %ymm3, 64(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm2, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1861,27 +1861,27 @@ ; ; AVX2-LABEL: splat4_v4f64_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX2-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX2-NEXT: vmovups %ymm3, 96(%rsi) -; AVX2-NEXT: vmovups %ymm1, 64(%rsi) -; AVX2-NEXT: vmovups %ymm2, 32(%rsi) -; AVX2-NEXT: vmovups %ymm0, (%rsi) +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: vpbroadcastq 
16(%rdi), %ymm1 +; AVX2-NEXT: vpbroadcastq 8(%rdi), %ymm2 +; AVX2-NEXT: vpbroadcastq 24(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, 96(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, 64(%rsi) +; AVX2-NEXT: vmovdqu %ymm2, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: splat4_v4f64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vmovups %zmm1, 64(%rsi) -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: vpbroadcastq 16(%rdi), %ymm1 +; AVX512-NEXT: vpbroadcastq 8(%rdi), %ymm2 +; AVX512-NEXT: vpbroadcastq 24(%rdi), %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, 64(%rsi) +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x double>, <4 x double>* %s, align 8 @@ -1908,27 +1908,27 @@ ; ; AVX2-LABEL: splat4_v4i64_load_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX2-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX2-NEXT: vmovups %ymm3, 96(%rsi) -; AVX2-NEXT: vmovups %ymm1, 64(%rsi) -; AVX2-NEXT: vmovups %ymm2, 32(%rsi) -; AVX2-NEXT: vmovups %ymm0, (%rsi) +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX2-NEXT: vpbroadcastq 16(%rdi), %ymm1 +; AVX2-NEXT: vpbroadcastq 8(%rdi), %ymm2 +; AVX2-NEXT: vpbroadcastq 24(%rdi), %ymm3 +; AVX2-NEXT: vmovdqu %ymm3, 96(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, 64(%rsi) +; AVX2-NEXT: vmovdqu %ymm2, 32(%rsi) +; AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: splat4_v4i64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vmovups %zmm1, 64(%rsi) -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512-NEXT: vpbroadcastq 16(%rdi), %ymm1 +; AVX512-NEXT: vpbroadcastq 8(%rdi), %ymm2 +; AVX512-NEXT: vpbroadcastq 24(%rdi), %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, 64(%rsi) +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x i64>, <4 x i64>* %s, align 8 Index: llvm/test/CodeGen/X86/x86-interrupt_cc.ll =================================================================== --- llvm/test/CodeGen/X86/x86-interrupt_cc.ll +++ llvm/test/CodeGen/X86/x86-interrupt_cc.ll @@ -29,86 +29,86 @@ ; CHECK64-KNL-NEXT: .cfi_def_cfa_offset 80 ; CHECK64-KNL-NEXT: subq $2096, %rsp ## encoding: [0x48,0x81,0xec,0x30,0x08,0x00,0x00] ; CHECK64-KNL-NEXT: ## imm = 0x830 -; CHECK64-KNL-NEXT: kmovw %k7, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xbc,0x24,0x2e,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k6, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 
2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xb4,0x24,0x2c,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k5, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xac,0x24,0x2a,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k4, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xa4,0x24,0x28,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k3, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x9c,0x24,0x26,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k2, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x94,0x24,0x24,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x8c,0x24,0x22,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill +; CHECK64-KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x84,0x24,0x20,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x7c,0x24,0x1f] -; CHECK64-KNL-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x74,0x24,0x1e] -; CHECK64-KNL-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x6c,0x24,0x1d] -; CHECK64-KNL-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x64,0x24,0x1c] -; CHECK64-KNL-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x5c,0x24,0x1b] -; CHECK64-KNL-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x54,0x24,0x1a] -; CHECK64-KNL-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x4c,0x24,0x19] -; CHECK64-KNL-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x44,0x24,0x18] -; CHECK64-KNL-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x7c,0x24,0x17] -; CHECK64-KNL-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x74,0x24,0x16] -; CHECK64-KNL-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x6c,0x24,0x15] -; CHECK64-KNL-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x64,0x24,0x14] -; CHECK64-KNL-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x5c,0x24,0x13] -; CHECK64-KNL-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x54,0x24,0x12] -; CHECK64-KNL-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## 
encoding: [0x62,0xe1,0x7c,0x48,0x11,0x4c,0x24,0x11] -; CHECK64-KNL-NEXT: vmovups %zmm16, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x44,0x24,0x10] -; CHECK64-KNL-NEXT: vmovups %zmm15, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x7c,0x24,0x0f] -; CHECK64-KNL-NEXT: vmovups %zmm14, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x74,0x24,0x0e] -; CHECK64-KNL-NEXT: vmovups %zmm13, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x6c,0x24,0x0d] -; CHECK64-KNL-NEXT: vmovups %zmm12, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x64,0x24,0x0c] -; CHECK64-KNL-NEXT: vmovups %zmm11, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x5c,0x24,0x0b] -; CHECK64-KNL-NEXT: vmovups %zmm10, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x54,0x24,0x0a] -; CHECK64-KNL-NEXT: vmovups %zmm9, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x4c,0x24,0x09] -; CHECK64-KNL-NEXT: vmovups %zmm8, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x44,0x24,0x08] -; CHECK64-KNL-NEXT: vmovups %zmm7, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x7c,0x24,0x07] -; CHECK64-KNL-NEXT: vmovups %zmm6, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x74,0x24,0x06] -; CHECK64-KNL-NEXT: vmovups %zmm5, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x6c,0x24,0x05] -; CHECK64-KNL-NEXT: vmovups %zmm4, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x64,0x24,0x04] -; CHECK64-KNL-NEXT: vmovups %zmm3, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x5c,0x24,0x03] -; CHECK64-KNL-NEXT: vmovups %zmm2, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x54,0x24,0x02] -; CHECK64-KNL-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x4c,0x24,0x01] -; CHECK64-KNL-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x04,0x24] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x7c,0x24,0x1f] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x74,0x24,0x1e] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x6c,0x24,0x1d] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x64,0x24,0x1c] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x5c,0x24,0x1b] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x54,0x24,0x1a] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; 
CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x4c,0x24,0x19] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x44,0x24,0x18] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x7c,0x24,0x17] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x74,0x24,0x16] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x6c,0x24,0x15] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x64,0x24,0x14] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x5c,0x24,0x13] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x54,0x24,0x12] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x4c,0x24,0x11] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x44,0x24,0x10] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x7c,0x24,0x0f] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x74,0x24,0x0e] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x6c,0x24,0x0d] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x64,0x24,0x0c] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x5c,0x24,0x0b] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x54,0x24,0x0a] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x4c,0x24,0x09] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x44,0x24,0x08] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x7c,0x24,0x07] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x74,0x24,0x06] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x6c,0x24,0x05] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x64,0x24,0x04] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x5c,0x24,0x03] +; CHECK64-KNL-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x54,0x24,0x02] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x4c,0x24,0x01] +; CHECK64-KNL-NEXT: vmovdqu64 %zmm0, (%rsp) ## 64-byte Spill +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x04,0x24] ; CHECK64-KNL-NEXT: .cfi_def_cfa_offset 2176 ; CHECK64-KNL-NEXT: .cfi_offset %rcx, -80 ; CHECK64-KNL-NEXT: .cfi_offset %rdx, -72 @@ -162,85 +162,85 @@ ; CHECK64-KNL-NEXT: cld ## encoding: [0xfc] ; CHECK64-KNL-NEXT: callq _bar ## encoding: [0xe8,A,A,A,A] ; CHECK64-KNL-NEXT: ## fixup A - offset: 1, value: _bar-4, kind: reloc_branch_4byte_pcrel -; CHECK64-KNL-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x04,0x24] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm1 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x4c,0x24,0x01] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm2 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x54,0x24,0x02] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm3 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x5c,0x24,0x03] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm4 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x64,0x24,0x04] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm5 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x6c,0x24,0x05] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm6 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x74,0x24,0x06] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm7 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x7c,0x24,0x07] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm8 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x44,0x24,0x08] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm9 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x4c,0x24,0x09] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm10 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x54,0x24,0x0a] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm11 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x5c,0x24,0x0b] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm12 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x64,0x24,0x0c] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm13 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x6c,0x24,0x0d] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm14 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x74,0x24,0x0e] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm15 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x7c,0x24,0x0f] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm16 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x44,0x24,0x10] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x4c,0x24,0x11] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x54,0x24,0x12] -; 
CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x5c,0x24,0x13] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x64,0x24,0x14] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x6c,0x24,0x15] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x74,0x24,0x16] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x7c,0x24,0x17] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x44,0x24,0x18] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x4c,0x24,0x19] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x54,0x24,0x1a] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x5c,0x24,0x1b] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x64,0x24,0x1c] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x6c,0x24,0x1d] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x74,0x24,0x1e] -; CHECK64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload -; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x7c,0x24,0x1f] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload +; CHECK64-KNL-NEXT: vmovdqu64 (%rsp), %zmm0 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x04,0x24] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x4c,0x24,0x01] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x54,0x24,0x02] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x5c,0x24,0x03] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x64,0x24,0x04] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x6c,0x24,0x05] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x74,0x24,0x06] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x7c,0x24,0x07] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x44,0x24,0x08] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x4c,0x24,0x09] +; CHECK64-KNL-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x54,0x24,0x0a] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x5c,0x24,0x0b] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x64,0x24,0x0c] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x6c,0x24,0x0d] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x74,0x24,0x0e] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x7c,0x24,0x0f] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x44,0x24,0x10] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x4c,0x24,0x11] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x54,0x24,0x12] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x5c,0x24,0x13] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x64,0x24,0x14] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x6c,0x24,0x15] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x74,0x24,0x16] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x7c,0x24,0x17] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x44,0x24,0x18] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x4c,0x24,0x19] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x54,0x24,0x1a] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x5c,0x24,0x1b] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x64,0x24,0x1c] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x6c,0x24,0x1d] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x74,0x24,0x1e] +; CHECK64-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload +; CHECK64-KNL-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x7c,0x24,0x1f] +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## 
encoding: [0xc5,0xf8,0x90,0x84,0x24,0x20,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x8c,0x24,0x22,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x94,0x24,0x24,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k3 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x9c,0x24,0x26,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k4 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xa4,0x24,0x28,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k5 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xac,0x24,0x2a,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k6 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xb4,0x24,0x2c,0x08,0x00,0x00] -; CHECK64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k7 ## 2-byte Reload +; CHECK64-KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload ; CHECK64-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xbc,0x24,0x2e,0x08,0x00,0x00] ; CHECK64-KNL-NEXT: addq $2096, %rsp ## encoding: [0x48,0x81,0xc4,0x30,0x08,0x00,0x00] ; CHECK64-KNL-NEXT: ## imm = 0x830 @@ -277,86 +277,86 @@ ; CHECK64-SKX-NEXT: .cfi_def_cfa_offset 80 ; CHECK64-SKX-NEXT: subq $2160, %rsp ## encoding: [0x48,0x81,0xec,0x70,0x08,0x00,0x00] ; CHECK64-SKX-NEXT: ## imm = 0x870 -; CHECK64-SKX-NEXT: kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xbc,0x24,0x68,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xb4,0x24,0x60,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xac,0x24,0x58,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k4, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xa4,0x24,0x50,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k3, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x9c,0x24,0x48,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k2, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x94,0x24,0x40,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k1, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x8c,0x24,0x38,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill +; CHECK64-SKX-NEXT: 
kmovq %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x84,0x24,0x30,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x7c,0x24,0x1f] -; CHECK64-SKX-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x74,0x24,0x1e] -; CHECK64-SKX-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x6c,0x24,0x1d] -; CHECK64-SKX-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x64,0x24,0x1c] -; CHECK64-SKX-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x5c,0x24,0x1b] -; CHECK64-SKX-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x54,0x24,0x1a] -; CHECK64-SKX-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x4c,0x24,0x19] -; CHECK64-SKX-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x44,0x24,0x18] -; CHECK64-SKX-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x7c,0x24,0x17] -; CHECK64-SKX-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x74,0x24,0x16] -; CHECK64-SKX-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x6c,0x24,0x15] -; CHECK64-SKX-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x64,0x24,0x14] -; CHECK64-SKX-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x5c,0x24,0x13] -; CHECK64-SKX-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x54,0x24,0x12] -; CHECK64-SKX-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x4c,0x24,0x11] -; CHECK64-SKX-NEXT: vmovups %zmm16, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x11,0x44,0x24,0x10] -; CHECK64-SKX-NEXT: vmovups %zmm15, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x7c,0x24,0x0f] -; CHECK64-SKX-NEXT: vmovups %zmm14, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x74,0x24,0x0e] -; CHECK64-SKX-NEXT: vmovups %zmm13, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x6c,0x24,0x0d] -; CHECK64-SKX-NEXT: vmovups %zmm12, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x64,0x24,0x0c] -; CHECK64-SKX-NEXT: vmovups %zmm11, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x5c,0x24,0x0b] -; CHECK64-SKX-NEXT: vmovups %zmm10, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x54,0x24,0x0a] -; CHECK64-SKX-NEXT: vmovups %zmm9, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x4c,0x24,0x09] -; CHECK64-SKX-NEXT: vmovups %zmm8, 
{{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x11,0x44,0x24,0x08] -; CHECK64-SKX-NEXT: vmovups %zmm7, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x7c,0x24,0x07] -; CHECK64-SKX-NEXT: vmovups %zmm6, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x74,0x24,0x06] -; CHECK64-SKX-NEXT: vmovups %zmm5, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x6c,0x24,0x05] -; CHECK64-SKX-NEXT: vmovups %zmm4, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x64,0x24,0x04] -; CHECK64-SKX-NEXT: vmovups %zmm3, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x5c,0x24,0x03] -; CHECK64-SKX-NEXT: vmovups %zmm2, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x54,0x24,0x02] -; CHECK64-SKX-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x4c,0x24,0x01] -; CHECK64-SKX-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x04,0x24] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x7c,0x24,0x1f] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x74,0x24,0x1e] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x6c,0x24,0x1d] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x64,0x24,0x1c] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x5c,0x24,0x1b] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x54,0x24,0x1a] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x4c,0x24,0x19] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x7f,0x44,0x24,0x18] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x7c,0x24,0x17] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x74,0x24,0x16] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x6c,0x24,0x15] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x64,0x24,0x14] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x5c,0x24,0x13] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x54,0x24,0x12] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; 
CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x4c,0x24,0x11] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x7f,0x44,0x24,0x10] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x7c,0x24,0x0f] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x74,0x24,0x0e] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x6c,0x24,0x0d] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x64,0x24,0x0c] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x5c,0x24,0x0b] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x54,0x24,0x0a] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x4c,0x24,0x09] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x7f,0x44,0x24,0x08] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x7c,0x24,0x07] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x74,0x24,0x06] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x6c,0x24,0x05] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x64,0x24,0x04] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x5c,0x24,0x03] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x54,0x24,0x02] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x4c,0x24,0x01] +; CHECK64-SKX-NEXT: vmovdqu64 %zmm0, (%rsp) ## 64-byte Spill +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x04,0x24] ; CHECK64-SKX-NEXT: .cfi_def_cfa_offset 2240 ; CHECK64-SKX-NEXT: .cfi_offset %rcx, -80 ; CHECK64-SKX-NEXT: .cfi_offset %rdx, -72 @@ -411,85 +411,85 @@ ; CHECK64-SKX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK64-SKX-NEXT: callq _bar ## encoding: [0xe8,A,A,A,A] ; CHECK64-SKX-NEXT: ## fixup A - offset: 1, value: _bar-4, kind: reloc_branch_4byte_pcrel -; CHECK64-SKX-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x04,0x24] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm1 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x4c,0x24,0x01] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm2 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x54,0x24,0x02] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm3 ## 
64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x5c,0x24,0x03] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm4 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x64,0x24,0x04] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm5 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x6c,0x24,0x05] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm6 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x74,0x24,0x06] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm7 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x7c,0x24,0x07] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm8 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x44,0x24,0x08] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm9 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x4c,0x24,0x09] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm10 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x54,0x24,0x0a] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm11 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x5c,0x24,0x0b] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm12 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x64,0x24,0x0c] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm13 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x6c,0x24,0x0d] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm14 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x74,0x24,0x0e] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm15 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0x7c,0x48,0x10,0x7c,0x24,0x0f] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm16 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x44,0x24,0x10] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x4c,0x24,0x11] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x54,0x24,0x12] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x5c,0x24,0x13] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x64,0x24,0x14] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x6c,0x24,0x15] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x74,0x24,0x16] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0x7c,0x48,0x10,0x7c,0x24,0x17] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x44,0x24,0x18] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x4c,0x24,0x19] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x54,0x24,0x1a] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte 
Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x5c,0x24,0x1b] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x64,0x24,0x1c] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x6c,0x24,0x1d] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x74,0x24,0x1e] -; CHECK64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload -; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x7c,0x24,0x1f] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload +; CHECK64-SKX-NEXT: vmovdqu64 (%rsp), %zmm0 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x04,0x24] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x4c,0x24,0x01] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x54,0x24,0x02] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x5c,0x24,0x03] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x64,0x24,0x04] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x6c,0x24,0x05] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x74,0x24,0x06] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x7c,0x24,0x07] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x44,0x24,0x08] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x4c,0x24,0x09] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x54,0x24,0x0a] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x5c,0x24,0x0b] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x64,0x24,0x0c] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x6c,0x24,0x0d] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x74,0x24,0x0e] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x71,0xfe,0x48,0x6f,0x7c,0x24,0x0f] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x44,0x24,0x10] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x4c,0x24,0x11] +; 
CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x54,0x24,0x12] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x5c,0x24,0x13] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x64,0x24,0x14] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x6c,0x24,0x15] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x74,0x24,0x16] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0xe1,0xfe,0x48,0x6f,0x7c,0x24,0x17] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x44,0x24,0x18] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x4c,0x24,0x19] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x54,0x24,0x1a] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x5c,0x24,0x1b] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x64,0x24,0x1c] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x6c,0x24,0x1d] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x74,0x24,0x1e] +; CHECK64-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload +; CHECK64-SKX-NEXT: ## encoding: [0x62,0x61,0xfe,0x48,0x6f,0x7c,0x24,0x1f] +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x84,0x24,0x30,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x8c,0x24,0x38,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k2 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x94,0x24,0x40,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k3 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x9c,0x24,0x48,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k4 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xa4,0x24,0x50,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xac,0x24,0x58,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte 
Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xb4,0x24,0x60,0x08,0x00,0x00] -; CHECK64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload +; CHECK64-SKX-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 8-byte Reload ; CHECK64-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xbc,0x24,0x68,0x08,0x00,0x00] ; CHECK64-SKX-NEXT: addq $2160, %rsp ## encoding: [0x48,0x81,0xc4,0x70,0x08,0x00,0x00] ; CHECK64-SKX-NEXT: ## imm = 0x870 @@ -514,38 +514,38 @@ ; CHECK32-KNL-NEXT: .cfi_def_cfa_offset 16 ; CHECK32-KNL-NEXT: subl $560, %esp ## encoding: [0x81,0xec,0x30,0x02,0x00,0x00] ; CHECK32-KNL-NEXT: ## imm = 0x230 -; CHECK32-KNL-NEXT: kmovw %k7, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xbc,0x24,0x2e,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k6, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xb4,0x24,0x2c,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k5, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xac,0x24,0x2a,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k4, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0xa4,0x24,0x28,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k3, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x9c,0x24,0x26,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k2, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x94,0x24,0x24,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k1, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x8c,0x24,0x22,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw %k0, {{[0-9]+}}(%esp) ## 2-byte Spill +; CHECK32-KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x91,0x84,0x24,0x20,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: vmovups %zmm7, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x7c,0x24,0x07] -; CHECK32-KNL-NEXT: vmovups %zmm6, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x74,0x24,0x06] -; CHECK32-KNL-NEXT: vmovups %zmm5, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x6c,0x24,0x05] -; CHECK32-KNL-NEXT: vmovups %zmm4, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x64,0x24,0x04] -; CHECK32-KNL-NEXT: vmovups %zmm3, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x5c,0x24,0x03] -; CHECK32-KNL-NEXT: vmovups %zmm2, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x54,0x24,0x02] -; CHECK32-KNL-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x4c,0x24,0x01] -; CHECK32-KNL-NEXT: vmovups %zmm0, (%esp) ## 64-byte Spill -; CHECK32-KNL-NEXT: ## encoding: 
[0x62,0xf1,0x7c,0x48,0x11,0x04,0x24] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x7c,0x24,0x07] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x74,0x24,0x06] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x6c,0x24,0x05] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x64,0x24,0x04] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x5c,0x24,0x03] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x54,0x24,0x02] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x4c,0x24,0x01] +; CHECK32-KNL-NEXT: vmovdqu64 %zmm0, (%esp) ## 64-byte Spill +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x04,0x24] ; CHECK32-KNL-NEXT: .cfi_def_cfa_offset 576 ; CHECK32-KNL-NEXT: .cfi_offset %eax, -16 ; CHECK32-KNL-NEXT: .cfi_offset %ecx, -12 @@ -569,37 +569,37 @@ ; CHECK32-KNL-NEXT: cld ## encoding: [0xfc] ; CHECK32-KNL-NEXT: calll _bar ## encoding: [0xe8,A,A,A,A] ; CHECK32-KNL-NEXT: ## fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4 -; CHECK32-KNL-NEXT: vmovups (%esp), %zmm0 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x04,0x24] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm1 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x4c,0x24,0x01] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm2 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x54,0x24,0x02] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm3 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x5c,0x24,0x03] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm4 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x64,0x24,0x04] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm5 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x6c,0x24,0x05] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm6 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x74,0x24,0x06] -; CHECK32-KNL-NEXT: vmovups {{[0-9]+}}(%esp), %zmm7 ## 64-byte Reload -; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x7c,0x24,0x07] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## 2-byte Reload +; CHECK32-KNL-NEXT: vmovdqu64 (%esp), %zmm0 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x04,0x24] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm1 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x4c,0x24,0x01] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm2 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x54,0x24,0x02] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm3 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x5c,0x24,0x03] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm4 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: 
[0x62,0xf1,0xfe,0x48,0x6f,0x64,0x24,0x04] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm5 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x6c,0x24,0x05] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm6 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x74,0x24,0x06] +; CHECK32-KNL-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm7 ## 64-byte Reload +; CHECK32-KNL-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x7c,0x24,0x07] +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x84,0x24,0x20,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x8c,0x24,0x22,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k2 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x94,0x24,0x24,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k3 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0x9c,0x24,0x26,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k4 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k4 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xa4,0x24,0x28,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k5 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k5 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xac,0x24,0x2a,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k6 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xb4,0x24,0x2c,0x02,0x00,0x00] -; CHECK32-KNL-NEXT: kmovw {{[0-9]+}}(%esp), %k7 ## 2-byte Reload +; CHECK32-KNL-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload ; CHECK32-KNL-NEXT: ## encoding: [0xc5,0xf8,0x90,0xbc,0x24,0x2e,0x02,0x00,0x00] ; CHECK32-KNL-NEXT: addl $560, %esp ## encoding: [0x81,0xc4,0x30,0x02,0x00,0x00] ; CHECK32-KNL-NEXT: ## imm = 0x230 @@ -618,38 +618,38 @@ ; CHECK32-SKX-NEXT: .cfi_def_cfa_offset 16 ; CHECK32-SKX-NEXT: subl $624, %esp ## encoding: [0x81,0xec,0x70,0x02,0x00,0x00] ; CHECK32-SKX-NEXT: ## imm = 0x270 -; CHECK32-SKX-NEXT: kmovq %k7, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k7, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xbc,0x24,0x68,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k6, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k6, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xb4,0x24,0x60,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k5, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k5, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xac,0x24,0x58,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k4, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k4, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0xa4,0x24,0x50,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k3, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k3, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: 
## encoding: [0xc4,0xe1,0xf8,0x91,0x9c,0x24,0x48,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k2, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x94,0x24,0x40,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k1, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x8c,0x24,0x38,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq %k0, {{[0-9]+}}(%esp) ## 8-byte Spill +; CHECK32-SKX-NEXT: kmovq %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 8-byte Spill ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x91,0x84,0x24,0x30,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: vmovups %zmm7, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x7c,0x24,0x07] -; CHECK32-SKX-NEXT: vmovups %zmm6, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x74,0x24,0x06] -; CHECK32-SKX-NEXT: vmovups %zmm5, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x6c,0x24,0x05] -; CHECK32-SKX-NEXT: vmovups %zmm4, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x64,0x24,0x04] -; CHECK32-SKX-NEXT: vmovups %zmm3, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x5c,0x24,0x03] -; CHECK32-SKX-NEXT: vmovups %zmm2, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x54,0x24,0x02] -; CHECK32-SKX-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x4c,0x24,0x01] -; CHECK32-SKX-NEXT: vmovups %zmm0, (%esp) ## 64-byte Spill -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x04,0x24] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x7c,0x24,0x07] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x74,0x24,0x06] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x6c,0x24,0x05] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x64,0x24,0x04] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x5c,0x24,0x03] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x54,0x24,0x02] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x4c,0x24,0x01] +; CHECK32-SKX-NEXT: vmovdqu64 %zmm0, (%esp) ## 64-byte Spill +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x04,0x24] ; CHECK32-SKX-NEXT: .cfi_def_cfa_offset 640 ; CHECK32-SKX-NEXT: .cfi_offset %eax, -16 ; CHECK32-SKX-NEXT: .cfi_offset %ecx, -12 @@ -674,37 +674,37 @@ ; CHECK32-SKX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; CHECK32-SKX-NEXT: calll _bar ## encoding: [0xe8,A,A,A,A] ; CHECK32-SKX-NEXT: ## fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4 -; CHECK32-SKX-NEXT: vmovups (%esp), %zmm0 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: 
[0x62,0xf1,0x7c,0x48,0x10,0x04,0x24] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm1 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x4c,0x24,0x01] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm2 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x54,0x24,0x02] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm3 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x5c,0x24,0x03] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm4 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x64,0x24,0x04] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm5 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x6c,0x24,0x05] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm6 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x74,0x24,0x06] -; CHECK32-SKX-NEXT: vmovups {{[0-9]+}}(%esp), %zmm7 ## 64-byte Reload -; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x7c,0x24,0x07] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ## 8-byte Reload +; CHECK32-SKX-NEXT: vmovdqu64 (%esp), %zmm0 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x04,0x24] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm1 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x4c,0x24,0x01] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm2 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x54,0x24,0x02] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm3 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x5c,0x24,0x03] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm4 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x64,0x24,0x04] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm5 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x6c,0x24,0x05] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm6 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x74,0x24,0x06] +; CHECK32-SKX-NEXT: vmovdqu64 {{[-0-9]+}}(%e{{[sb]}}p), %zmm7 ## 64-byte Reload +; CHECK32-SKX-NEXT: ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x7c,0x24,0x07] +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k0 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x84,0x24,0x30,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x8c,0x24,0x38,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k2 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x94,0x24,0x40,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k3 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k3 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0x9c,0x24,0x48,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k4 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k4 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xa4,0x24,0x50,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k5 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k5 ## 
8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xac,0x24,0x58,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k6 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xb4,0x24,0x60,0x02,0x00,0x00] -; CHECK32-SKX-NEXT: kmovq {{[0-9]+}}(%esp), %k7 ## 8-byte Reload +; CHECK32-SKX-NEXT: kmovq {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 8-byte Reload ; CHECK32-SKX-NEXT: ## encoding: [0xc4,0xe1,0xf8,0x90,0xbc,0x24,0x68,0x02,0x00,0x00] ; CHECK32-SKX-NEXT: addl $624, %esp ## encoding: [0x81,0xc4,0x70,0x02,0x00,0x00] ; CHECK32-SKX-NEXT: ## imm = 0x270 Index: llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll =================================================================== --- llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll +++ llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll @@ -9,8 +9,8 @@ ; CHECK-LABEL: broadcast128: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: retq %1 = alloca <2 x i64>, align 16 %2 = bitcast <2 x i64>* %1 to i8* Index: llvm/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll =================================================================== --- llvm/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll +++ llvm/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll @@ -169,7 +169,7 @@ define <16 x i8> @test_int_x86_xop_vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: test_int_x86_xop_vpcomfalseb: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) ; ret <16 x i8> %res @@ -179,7 +179,7 @@ define <4 x i32> @test_int_x86_xop_vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: test_int_x86_xop_vpcomfalsed: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) ; ret <4 x i32> %res @@ -189,7 +189,7 @@ define <2 x i64> @test_int_x86_xop_vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: test_int_x86_xop_vpcomfalseq: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) ; ret <2 x i64> %res @@ -199,7 +199,7 @@ define <16 x i8> @test_int_x86_xop_vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: test_int_x86_xop_vpcomfalseub: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) ; ret <16 x i8> %res @@ -209,7 +209,7 @@ define <4 x i32> @test_int_x86_xop_vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-LABEL: test_int_x86_xop_vpcomfalseud: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) ; ret <4 x i32> %res @@ -219,7 +219,7 @@ define <2 x i64> @test_int_x86_xop_vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-LABEL: test_int_x86_xop_vpcomfalseuq: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) ; ret <2 x i64> %res @@ -229,7 +229,7 @@ define <8 x i16> @test_int_x86_xop_vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: test_int_x86_xop_vpcomfalseuw: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) ; ret <8 x i16> %res @@ -239,7 +239,7 @@ define <8 x i16> @test_int_x86_xop_vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: test_int_x86_xop_vpcomfalsew: ; CHECK: # %bb.0: -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) ; ret <8 x i16> %res Index: llvm/test/CodeGen/X86/xop-mask-comments.ll =================================================================== --- llvm/test/CodeGen/X86/xop-mask-comments.ll +++ llvm/test/CodeGen/X86/xop-mask-comments.ll @@ -100,14 +100,14 @@ define <2 x double> @vpermil2pd_21(<2 x double> %a0, <2 x double> %a1) { ; X32-LABEL: vpermil2pd_21: ; X32: # %bb.0: -; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X32-NEXT: retl ; ; X64-LABEL: vpermil2pd_21: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64-NEXT: retq %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> , i8 2) ret <2 x double> %1