Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4380,6 +4380,8 @@ isPowerOf2_32(WidenWidth / MemVTWidth) && (MemVTWidth <= Width || (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { + if (MemVTWidth == WidenWidth) + return MemVT; RetVT = MemVT; break; } @@ -4391,7 +4393,10 @@ VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) { EVT MemVT = (MVT::SimpleValueType) VT; unsigned MemVTWidth = MemVT.getSizeInBits(); - if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() && + auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT); + if ((Action == TargetLowering::TypeLegal || + Action == TargetLowering::TypePromoteInteger) && + WidenEltVT == MemVT.getVectorElementType() && (WidenWidth % MemVTWidth) == 0 && isPowerOf2_32(WidenWidth / MemVTWidth) && (MemVTWidth <= Width || Index: llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1239,11 +1239,13 @@ SDLoc DL(Op); + const bool TruncatingStore = StoreNode->isTruncatingStore(); + // Neither LOCAL nor PRIVATE can do vectors at the moment - if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && + if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS || + TruncatingStore) && VT.isVector()) { - if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && - StoreNode->isTruncatingStore()) { + if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); // TODO: can the chain be replaced without creating a new store? @@ -1269,7 +1271,7 @@ if (AS == AMDGPUAS::GLOBAL_ADDRESS) { // It is beneficial to create MSKOR here instead of combiner to avoid // artificial dependencies introduced by RMW - if (StoreNode->isTruncatingStore()) { + if (TruncatingStore) { assert(VT.bitsLE(MVT::i32)); SDValue MaskConstant; if (MemVT == MVT::i8) { @@ -1309,8 +1311,8 @@ // Convert pointer from byte address to dword address. Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr); - if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { - llvm_unreachable("Truncated and indexed stores not supported yet"); + if (StoreNode->isIndexed()) { + llvm_unreachable("Indexed stores not supported yet"); } else { Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); } Index: llvm/trunk/test/CodeGen/AMDGPU/load-constant-i16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-constant-i16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -29,7 +29,8 @@ ; FUNC-LABEL: {{^}}constant_load_v3i16: ; GCN: s_load_dwordx2 s -; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 2, #1 ; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) { entry: @@ -186,15 +187,11 @@ ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}}, ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}}, ; EG: CF_END -; EG-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1 -; EG-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 -; TODO: This should use DST, but for some there are redundant MOVs -; EG-DAG: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EG-DAG: VTX_READ_16 [[ST_LO]].X, [[SRC:T[0-9]\.[XYZW]]], 0, #1 +; EG-DAG: VTX_READ_16 {{T[0-9]\.[XYZW]}}, [[SRC]], 2, #1 +; EG-DAG: VTX_READ_16 [[ST_HI]].X, [[SRC]], 4, #1 +; EG-DAG: LSHR {{[* ]*}}{{T[0-9]\.[XYZW]}}, {{T[0-9]\.[XYZW]}}, literal ; EG-DAG: 16 -; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal -; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal -; EG-DAG: 65535 -; EG-DAG: 65535 define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in @@ -209,11 +206,12 @@ ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}}, ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}}, ; v3i16 is naturally 8 byte aligned -; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[PTR:T[0-9]\.[XYZW]]], 0, #1 -; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 -; EG-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal -; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal -; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal +; EG-DAG: VTX_READ_16 [[ST_LO]].X, [[SRC:T[0-9]\.[XYZW]]], 0, #1 +; EG-DAG: VTX_READ_16 [[DST_MID:T[0-9]\.[XYZW]]], [[SRC]], 2, #1 +; EG-DAG: VTX_READ_16 [[ST_HI]].X, [[SRC]], 4, #1 +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, [[ST_LO]].X, 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, [[DST_MID]], 0.0, literal +; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[ST_HI]].X, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) { Index: llvm/trunk/test/CodeGen/AMDGPU/load-global-i16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-global-i16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-global-i16.ll @@ -34,7 +34,8 @@ ; GCN-NOHSA: buffer_load_dwordx2 v ; GCN-HSA: flat_load_dwordx2 v -; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 +; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 2, #1 ; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: @@ -195,10 +196,9 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}} ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}, ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}}, -; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1 -; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1 -; TODO: This should use DST, but for some there are redundant MOVs -; EGCM: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal +; EGCM-DAG: VTX_READ_16 [[ST_LO]].X, [[SRC:T[0-9]\.[XYZW]]], 0, #1 +; EGCM-DAG: VTX_READ_16 {{T[0-9]\.[XYZW]}}, [[SRC]], 2, #1 +; EGCM-DAG: VTX_READ_16 [[ST_HI]].X, [[SRC]], 4, #1 ; EGCM: 16 define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: @@ -216,11 +216,11 @@ ; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}} ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}, ; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}}, -; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 0, #1 -; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 4, #1 -; TODO: This should use DST, but for some there are redundant MOVs -; EGCM-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal -; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal +; EGCM-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[SRC:T[0-9].[XYZW]]], 0, #1 +; EGCM-DAG: VTX_READ_16 [[DST_MID:T[0-9]\.[XYZW]]], [[SRC]], 2, #1 +; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], [[SRC]], 4, #1 +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, [[DST_LO]], 0.0, literal +; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, [[DST_MID]], 0.0, literal ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal ; EGCM-DAG: 16 ; EGCM-DAG: 16 Index: llvm/trunk/test/CodeGen/AMDGPU/load-local-i16.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-local-i16.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-local-i16.ll @@ -52,7 +52,7 @@ ; GCN-DAG: ds_write_b16 ; EG-DAG: LDS_USHORT_READ_RET -; EG-DAG: LDS_READ_RET +; EG-DAG: LDS_USHORT_READ_RET define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in @@ -235,7 +235,9 @@ ; GCN-DAG: ds_write_b32 ; GCN-DAG: ds_write_b64 -; EG: LDS_READ_RET +; EG: LDS_USHORT_READ_RET +; EG: LDS_USHORT_READ_RET +; EG: LDS_USHORT_READ_RET define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in @@ -252,7 +254,9 @@ ; GCN-DAG: ds_write_b32 ; GCN-DAG: ds_write_b64 -; EG: LDS_READ_RET +; EG: LDS_USHORT_READ_RET +; EG: LDS_USHORT_READ_RET +; EG: LDS_USHORT_READ_RET ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT Index: llvm/trunk/test/CodeGen/X86/load-local-v3i1.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/load-local-v3i1.ll +++ llvm/trunk/test/CodeGen/X86/load-local-v3i1.ll @@ -0,0 +1,70 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s + +; widen a v3i1 to v4i1 to do a vector load/store. We would previously +; reconstruct the said v3i1 from the first element of the vector by filling all +; the lanes of the vector with that first element, which was obviously wrong. +; This was done in the type-legalizing of the DAG, when legalizing the load. + +; Function Attrs: argmemonly nounwind readonly +declare <3 x i32> @llvm.masked.load.v3i32.p1v3i32(<3 x i32> addrspace(1)*, i32, <3 x i1>, <3 x i32>) + +; Function Attrs: argmemonly nounwind +declare void @llvm.masked.store.v3i32.p1v3i32(<3 x i32>, <3 x i32> addrspace(1)*, i32, <3 x i1>) + +define <3 x i32> @masked_load_v3(i32 addrspace(1)*, <3 x i1>) { +entry: + %2 = bitcast i32 addrspace(1)* %0 to <3 x i32> addrspace(1)* + %3 = call <3 x i32> @llvm.masked.load.v3i32.p1v3i32(<3 x i32> addrspace(1)* %2, i32 4, <3 x i1> %1, <3 x i32> undef) + ret <3 x i32> %3 +} + +define void @masked_store4_v3(<3 x i32>, i32 addrspace(1)*, <3 x i1>) { +entry: + %3 = bitcast i32 addrspace(1)* %1 to <3 x i32> addrspace(1)* + call void @llvm.masked.store.v3i32.p1v3i32(<3 x i32> %0, <3 x i32> addrspace(1)* %3, i32 4, <3 x i1> %2) + ret void +} + +define void @local_load_v3i1(i32 addrspace(1)* %out, i32 addrspace(1)* %in, <3 x i1>* %predicate_ptr) nounwind { +; CHECK-LABEL: local_load_v3i1: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: movzbl (%rdx), %ebp +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: shrl %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: movl %ebp, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: pinsrd $1, %eax, %xmm0 +; CHECK-NEXT: shrl $2, %ebp +; CHECK-NEXT: andl $1, %ebp +; CHECK-NEXT: pinsrd $2, %ebp, %xmm0 +; CHECK-NEXT: movd %xmm0, %ebx +; CHECK-NEXT: pextrd $1, %xmm0, %r15d +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: movl %r15d, %edx +; CHECK-NEXT: movl %ebp, %ecx +; CHECK-NEXT: callq masked_load_v3 +; CHECK-NEXT: movq %r14, %rdi +; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: movl %r15d, %edx +; CHECK-NEXT: movl %ebp, %ecx +; CHECK-NEXT: callq masked_store4_v3 +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq + %predicate = load <3 x i1>, <3 x i1>* %predicate_ptr + %load1 = call <3 x i32> @masked_load_v3(i32 addrspace(1)* %in, <3 x i1> %predicate) + call void @masked_store4_v3(<3 x i32> %load1, i32 addrspace(1)* %out, <3 x i1> %predicate) + ret void +} Index: llvm/trunk/test/CodeGen/X86/widen_arith-3.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/widen_arith-3.ll +++ llvm/trunk/test/CodeGen/X86/widen_arith-3.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: pushl %ebp ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-8, %esp -; CHECK-NEXT: subl $40, %esp +; CHECK-NEXT: subl $32, %esp ; CHECK-NEXT: movl {{\.LCPI.*}}, %eax ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 @@ -26,9 +26,7 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl 12(%ebp), %edx ; CHECK-NEXT: movl 8(%ebp), %ecx -; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; CHECK-NEXT: pinsrd $2, 4(%edx,%eax,8), %xmm2 +; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; CHECK-NEXT: psubd %xmm0, %xmm2 ; CHECK-NEXT: pextrw $4, %xmm2, 4(%ecx,%eax,8) ; CHECK-NEXT: pshufb %xmm1, %xmm2 Index: llvm/trunk/test/CodeGen/X86/widen_cast-2.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/widen_cast-2.ll +++ llvm/trunk/test/CodeGen/X86/widen_cast-2.ll @@ -21,9 +21,8 @@ ; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm2 ; CHECK-NEXT: psubw %xmm0, %xmm1 ; CHECK-NEXT: psubw %xmm0, %xmm2 -; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax) -; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) ; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax) +; CHECK-NEXT: movq %xmm2, 16(%ecx,%eax) ; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: cmpl $3, (%esp) Index: llvm/trunk/test/CodeGen/X86/widen_cast-3.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/widen_cast-3.ll +++ llvm/trunk/test/CodeGen/X86/widen_cast-3.ll @@ -11,8 +11,7 @@ ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: psubd %xmm1, %xmm0 ; X86-NEXT: pextrd $2, %xmm0, 8(%eax) -; X86-NEXT: pextrd $1, %xmm0, 4(%eax) -; X86-NEXT: movd %xmm0, (%eax) +; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: convert: Index: llvm/trunk/test/CodeGen/X86/widen_load-2.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/widen_load-2.ll +++ llvm/trunk/test/CodeGen/X86/widen_load-2.ll @@ -15,8 +15,7 @@ ; X86-NEXT: movdqa (%edx), %xmm0 ; X86-NEXT: paddd (%ecx), %xmm0 ; X86-NEXT: pextrd $2, %xmm0, 8(%eax) -; X86-NEXT: pextrd $1, %xmm0, 4(%eax) -; X86-NEXT: movd %xmm0, (%eax) +; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add3i32: @@ -40,16 +39,13 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pinsrd $1, 4(%edx), %xmm0 +; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: pinsrd $2, 8(%edx), %xmm0 -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1 +; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: pextrd $1, %xmm1, 4(%eax) +; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: pextrd $2, %xmm1, 8(%eax) -; X86-NEXT: movd %xmm1, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add3i32_2: @@ -81,9 +77,8 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddd (%ecx), %xmm0 ; X86-NEXT: paddd 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: pextrd $2, %xmm1, 24(%eax) +; X86-NEXT: movq %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -151,16 +146,12 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $8, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl 16(%ebp), %ecx ; X86-NEXT: movl 12(%ebp), %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-NEXT: pinsrd $2, 4(%edx), %xmm0 -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X86-NEXT: pinsrd $2, 4(%ecx), %xmm1 +; X86-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; X86-NEXT: paddd %xmm0, %xmm1 ; X86-NEXT: pextrw $4, %xmm1, 4(%eax) ; X86-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] @@ -225,8 +216,7 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddw (%ecx), %xmm0 ; X86-NEXT: paddw 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: movq %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -331,11 +321,10 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddb (%ecx), %xmm0 ; X86-NEXT: paddb 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: pextrd $2, %xmm1, 24(%eax) ; X86-NEXT: pextrw $6, %xmm1, 28(%eax) ; X86-NEXT: pextrb $14, %xmm1, 30(%eax) +; X86-NEXT: movq %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ;