Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4102,6 +4102,8 @@ isPowerOf2_32(WidenWidth / MemVTWidth) && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { + if (MemVTWidth == WidenWidth) + return MemVT; RetVT = MemVT; break; } @@ -4113,7 +4115,10 @@ VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) { EVT MemVT = (MVT::SimpleValueType) VT; unsigned MemVTWidth = MemVT.getSizeInBits(); - if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() && + auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT); + if ((Action == TargetLowering::TypeLegal || + Action == TargetLowering::TypePromoteInteger) && + WidenEltVT == MemVT.getVectorElementType() && (WidenWidth % MemVTWidth) == 0 && isPowerOf2_32(WidenWidth / MemVTWidth) && (MemVTWidth <= Width || Index: test/CodeGen/AMDGPU/load-constant-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-constant-i16.ll +++ test/CodeGen/AMDGPU/load-constant-i16.ll @@ -29,7 +29,8 @@ ; FUNC-LABEL: {{^}}constant_load_v3i16: ; GCN: s_load_dwordx2 s -; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 2, #1 ; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) { entry: @@ -186,15 +187,11 @@ ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}}, ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}}, ; EG: CF_END -; EG-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 -; TODO: This should use DST, but for some there are redundant MOVs -; EG-DAG: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: VTX_READ_16 [[ST_LO]].X, [[SRC:T[0-9]\.[XYZW]]], 0, #1 +; EG-DAG: VTX_READ_16 {{T[0-9]\.[XYZW]}}, [[SRC]], 2, #1 +; EG-DAG: VTX_READ_16 [[ST_HI]].X, [[SRC]], 4, #1 +; EG-DAG: LSHR {{[* ]*}}{{T[0-9]\.[XYZW]}}, {{T[0-9]\.[XYZW]}}, literal ; EG-DAG: 16 -; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal -; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal -; EG-DAG: 65535 -; EG-DAG: 65535 define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in @@ -209,11 +206,12 @@ ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}}, ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}}, ; v3i16 is naturally 8 byte aligned -; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[PTR:T[0-9]\.[XYZW]]], 0, #1 -; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 -; EG-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal -; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal +; EG-DAG: VTX_READ_16 [[DST_LO:[[ST_LO]].X]], [[SRC:T[0-9]\.[XYZW]]], 0, #1 +; EG-DAG: VTX_READ_16 [[DST_MID:T[0-9]\.[XYZW]]], [[SRC]], 2, #1 +; EG-DAG: VTX_READ_16 [[DST_HI:[[ST_HI]].X]], [[SRC]], 4, #1 +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, [[DST_LO]], 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, [[DST_MID]], 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) { Index: test/CodeGen/AMDGPU/load-global-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-global-i16.ll +++ test/CodeGen/AMDGPU/load-global-i16.ll @@ -34,7 +34,8 @@ ; GCN-NOHSA: buffer_load_dwordx2 v ; GCN-HSA: flat_load_dwordx2 v -; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 2, #1 ; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: @@ -195,10 +196,9 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}} ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}, ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}}, -; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1 -; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 -; TODO: This should use DST, but for some there are redundant MOVs -; EGCM: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EGCM-DAG: VTX_READ_16 [[ST_LO]].X, [[SRC:T[0-9]\.[XYZW]]], 0, #1 +; EGCM-DAG: VTX_READ_16 {{T[0-9]\.[XYZW]}}, [[SRC]], 2, #1 +; EGCM-DAG: VTX_READ_16 [[ST_HI]].X, [[SRC]], 4, #1 ; EGCM: 16 define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: @@ -216,11 +216,11 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}} ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}, ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}}, -; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 0, #1 -; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 4, #1 -; TODO: This should use DST, but for some there are redundant MOVs -; EGCM-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal -; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal +; EGCM-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[SRC:T[0-9]\.[XYZW]]], 0, #1 +; EGCM-DAG: VTX_READ_16 [[DST_MID:T[0-9]\.[XYZW]]], [[SRC]], 2, #1 +; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], [[SRC]], 4, #1 +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, [[DST_LO]], 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, [[DST_MID]], 0.0, literal ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal ; EGCM-DAG: 16 ; EGCM-DAG: 16 Index: test/CodeGen/AMDGPU/load-local-i16.ll =================================================================== --- test/CodeGen/AMDGPU/load-local-i16.ll +++ test/CodeGen/AMDGPU/load-local-i16.ll @@ -52,7 +52,8 @@ ; GCN-DAG: ds_write_b16 ; EG-DAG: LDS_USHORT_READ_RET -; EG-DAG: LDS_READ_RET +; EG-DAG: LDS_USHORT_READ_RET +; EG-DAG: LDS_USHORT_READ_RET define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in @@ -235,7 +236,9 @@ ; GCN-DAG: ds_write_b32 ; GCN-DAG: ds_write_b64 -; EG: LDS_READ_RET +; EG: LDS_USHORT_READ_RET +; EG: LDS_USHORT_READ_RET +; EG: LDS_USHORT_READ_RET define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in @@ -252,7 +255,9 @@ ; GCN-DAG: ds_write_b32 ; GCN-DAG: ds_write_b64 -; EG: LDS_READ_RET +; EG: LDS_USHORT_READ_RET +; EG: LDS_USHORT_READ_RET +; EG: LDS_USHORT_READ_RET ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT Index: test/CodeGen/X86/load-local-i1.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/load-local-i1.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s + +; widen a v3i1 to v4i1 to do a vector load/store. We would previously +; reconstruct the said v3i1 from the first element of the vector by filling all +; the lanes of the vector with that first element, which was obviously wrong. +; This was done in the type-legalizing of the DAG, when legalizing the load. + +; Function Attrs: argmemonly nounwind readonly +declare <3 x i32> @llvm.masked.load.v3i32.p1v3i32(<3 x i32> addrspace(1)*, i32, <3 x i1>, <3 x i32>) + +; Function Attrs: argmemonly nounwind +declare void @llvm.masked.store.v3i32.p1v3i32(<3 x i32>, <3 x i32> addrspace(1)*, i32, <3 x i1>) + +define <3 x i32> @masked_load_v3(i32 addrspace(1)*, <3 x i1>) { +entry: + %2 = bitcast i32 addrspace(1)* %0 to <3 x i32> addrspace(1)* + %3 = call <3 x i32> @llvm.masked.load.v3i32.p1v3i32(<3 x i32> addrspace(1)* %2, i32 4, <3 x i1> %1, <3 x i32> undef) + ret <3 x i32> %3 +} + +define void @masked_store4_v3(<3 x i32>, i32 addrspace(1)*, <3 x i1>) { +entry: + %3 = bitcast i32 addrspace(1)* %1 to <3 x i32> addrspace(1)* + call void @llvm.masked.store.v3i32.p1v3i32(<3 x i32> %0, <3 x i32> addrspace(1)* %3, i32 4, <3 x i1> %2) + ret void +} + +define void @local_load_v3i1(i32 addrspace(1)* %out, i32 addrspace(1)* %in, <3 x i1>* %predicate_ptr) { +; CHECK-LABEL: local_load_v3i1: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset %rbx, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: movzbl (%rdx), %ebp +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: shrl %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: movl %ebp, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: pinsrd $1, %eax, %xmm0 +; CHECK-NEXT: shrl $2, %ebp +; CHECK-NEXT: andl $1, %ebp +; CHECK-NEXT: pinsrd $2, %ebp, %xmm0 +; CHECK-NEXT: movd %xmm0, %ebx +; CHECK-NEXT: pextrd $1, %xmm0, %r15d +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: movl %r15d, %edx +; CHECK-NEXT: movl %ebp, %ecx +; CHECK-NEXT: callq masked_load_v3 +; CHECK-NEXT: movq %r14, %rdi +; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: movl %r15d, %edx +; CHECK-NEXT: movl %ebp, %ecx +; CHECK-NEXT: callq masked_store4_v3 +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %predicate = load <3 x i1>, <3 x i1>* %predicate_ptr + %load1 = call <3 x i32> @masked_load_v3(i32 addrspace(1)* %in, <3 x i1> %predicate) + call void @masked_store4_v3(<3 x i32> %load1, i32 addrspace(1)* %out, <3 x i1> %predicate) + ret void +} Index: test/CodeGen/X86/widen_arith-3.ll =================================================================== --- test/CodeGen/X86/widen_arith-3.ll +++ test/CodeGen/X86/widen_arith-3.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: pushl %ebp ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-8, %esp -; CHECK-NEXT: subl $40, %esp +; CHECK-NEXT: subl $32, %esp ; CHECK-NEXT: movl {{\.LCPI.*}}, %eax ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 @@ -26,9 +26,7 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl 12(%ebp), %edx ; CHECK-NEXT: movl 8(%ebp), %ecx -; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; CHECK-NEXT: pinsrd $2, 4(%edx,%eax,8), %xmm2 +; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; CHECK-NEXT: psubd %xmm0, %xmm2 ; CHECK-NEXT: pextrw $4, %xmm2, 4(%ecx,%eax,8) ; CHECK-NEXT: pshufb %xmm1, %xmm2 Index: test/CodeGen/X86/widen_cast-2.ll =================================================================== --- test/CodeGen/X86/widen_cast-2.ll +++ test/CodeGen/X86/widen_cast-2.ll @@ -22,8 +22,7 @@ ; CHECK-NEXT: psubw %xmm0, %xmm1 ; CHECK-NEXT: psubw %xmm0, %xmm2 ; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax) -; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) -; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax) +; CHECK-NEXT: movq %xmm2, 16(%ecx,%eax) ; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: cmpl $3, (%esp) Index: test/CodeGen/X86/widen_cast-3.ll =================================================================== --- test/CodeGen/X86/widen_cast-3.ll +++ test/CodeGen/X86/widen_cast-3.ll @@ -11,8 +11,7 @@ ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: psubd %xmm1, %xmm0 ; X86-NEXT: pextrd $2, %xmm0, 8(%eax) -; X86-NEXT: pextrd $1, %xmm0, 4(%eax) -; X86-NEXT: movd %xmm0, (%eax) +; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert: Index: test/CodeGen/X86/widen_load-2.ll =================================================================== --- test/CodeGen/X86/widen_load-2.ll +++ test/CodeGen/X86/widen_load-2.ll @@ -15,8 +15,7 @@ ; X86-NEXT: movdqa (%edx), %xmm0 ; X86-NEXT: paddd (%ecx), %xmm0 ; X86-NEXT: pextrd $2, %xmm0, 8(%eax) -; X86-NEXT: pextrd $1, %xmm0, 4(%eax) -; X86-NEXT: movd %xmm0, (%eax) +; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add3i32: @@ -40,16 +39,13 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pinsrd $1, 4(%edx), %xmm0 +; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: pinsrd $2, 8(%edx), %xmm0 -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1 +; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 +; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: pextrd $2, %xmm1, 8(%eax) -; X86-NEXT: pextrd $1, %xmm1, 4(%eax) -; X86-NEXT: movd %xmm1, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add3i32_2: @@ -82,8 +78,7 @@ ; X86-NEXT: paddd (%ecx), %xmm0 ; X86-NEXT: paddd 16(%ecx), %xmm1 ; X86-NEXT: pextrd $2, %xmm1, 24(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) -; X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: movq %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -151,16 +146,12 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $8, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl 16(%ebp), %ecx ; X86-NEXT: movl 12(%ebp), %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-NEXT: pinsrd $2, 4(%edx), %xmm0 -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X86-NEXT: pinsrd $2, 4(%ecx), %xmm1 +; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-NEXT: paddd %xmm0, %xmm1 ; X86-NEXT: pextrw $4, %xmm1, 4(%eax) ; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] @@ -225,8 +216,7 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddw (%ecx), %xmm0 ; X86-NEXT: paddw 16(%ecx), %xmm1 -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) -; X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: movq %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -334,8 +324,7 @@ ; X86-NEXT: pextrb $14, %xmm1, 30(%eax) ; X86-NEXT: pextrw $6, %xmm1, 28(%eax) ; X86-NEXT: pextrd $2, %xmm1, 24(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) -; X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: movq %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ;