Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -160,6 +160,9 @@
       /// and zero out the high word.
       MMX_MOVW2D,
 
+      /// Build MMX vector from x86mmx source values in lowest elements.
+      MMX_BUILD_VECTOR,
+
       /// Extract an 8-bit value from a vector and zero extend it to
       /// i32, corresponds to X86::PEXTRB.
       PEXTRB,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -25267,6 +25267,7 @@
   case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
   case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
   case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
+  case X86ISD::MMX_BUILD_VECTOR: return "X86ISD::MMX_BUILD_VECTOR";
   case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
   case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
   case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
@@ -30783,6 +30784,48 @@
     }
   }
 
+  // Detect bitcasts of 64-bit build vectors and convert to an
+  // MMX_BUILD_VECTOR, which takes MMX-typed inputs with the value in the
+  // lowest element.
+  if (N0.getOpcode() == ISD::BUILD_VECTOR &&
+      (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
+       SrcVT == MVT::v8i8)) {
+    SDLoc DL(N0);
+    auto CreateMMXElement = [&](SDValue V) {
+      if (V.isUndef())
+        return DAG.getUNDEF(MVT::x86mmx);
+      if (V.getValueType().isFloatingPoint()) {
+        if (Subtarget.hasSSE1() && Subtarget.is64Bit() &&
+            !isa<ConstantFPSDNode>(V)) {
+          V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
+                          DAG.getUNDEF(MVT::v4f32), V,
+                          DAG.getIntPtrConstant(0, DL));
+          return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
+                             DAG.getBitcast(MVT::v2i64, V));
+        }
+        V = DAG.getBitcast(MVT::i32, V);
+      } else {
+        V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
+      }
+      return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
+    };
+
+    SmallVector<SDValue, 8> Ops;
+    unsigned NumElts = N0.getNumOperands();
+    auto *BV = cast<BuildVectorSDNode>(N0);
+    if (SDValue Splat = BV->getSplatValue()) {
+      // If it's a splat, build one MMX element and repeat it so that all
+      // operands match and the broadcast patterns can fire.
+      if (Splat.isUndef())
+        return DAG.getUNDEF(VT);
+      Ops.append(NumElts, CreateMMXElement(Splat));
+    } else {
+      for (unsigned i = 0; i != NumElts; ++i)
+        Ops.push_back(CreateMMXElement(N0.getOperand(i)));
+    }
+    return DAG.getNode(X86ISD::MMX_BUILD_VECTOR, DL, VT, Ops);
+  }
+
   // Detect bitcasts between element or subvector extraction to x86mmx.
   if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
        N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
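For reference, a minimal IR example of the kind of bitcast this combine rewrites (hypothetical function, not part of the patch; it mirrors build_v2i32_01 in the mmx-build-vector.ll test added below). Each scalar becomes an MMX_MOVW2D (movd), and the MMX_BUILD_VECTOR is then selected to punpckldq by the new patterns, avoiding the old store/reload through a stack slot:

; Sketch only - assumed function name; the padd.d intrinsic keeps the value
; in the MMX domain, as in the tests below.
declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)

define void @example_v2i32(x86_mmx* %p0, i32 %a0, i32 %a1) nounwind {
  %1 = insertelement <2 x i32> undef, i32 %a0, i32 0
  %2 = insertelement <2 x i32> %1, i32 %a1, i32 1
  %3 = bitcast <2 x i32> %2 to x86_mmx   ; becomes X86ISD::MMX_BUILD_VECTOR
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx* %p0
  ret void
}

; Expected x86-64 output after this patch (per the test diff below):
;   movd %edx, %mm0
;   movd %esi, %mm1
;   punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
;   paddd %mm1, %mm1
;   movq %mm1, (%rdi)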
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -21,6 +21,9 @@
 // GPR to low word of MMX.
 def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1,
                            [SDTCisVT<0, x86mmx>, SDTCisVT<1, i32>]>>;
+// Build MMX vector from values held in the lowest element of each x86mmx
+// operand.
+def MMX_X86buildvector : SDNode<"X86ISD::MMX_BUILD_VECTOR", SDTypeProfile<1, -1,
+                                [SDTCisVT<0, x86mmx>]>>;
 
 //===----------------------------------------------------------------------===//
 // MMX Pattern Fragments
Index: lib/Target/X86/X86InstrMMX.td
===================================================================
--- lib/Target/X86/X86InstrMMX.td
+++ lib/Target/X86/X86InstrMMX.td
@@ -680,3 +680,41 @@
                                 (bc_v2i64 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
             (MMX_CVTTPD2PIirr VR128:$src)>;
 }
+
+// Build vectors - stagger the AddedComplexity so the longer input lists are
+// matched first.
+let Predicates = [HasMMX] in {
+  def : Pat<(x86mmx (MMX_X86buildvector VR64:$src1, VR64:$src2)),
+            (MMX_PUNPCKLDQirr VR64:$src1, VR64:$src2)>;
+  let AddedComplexity = 10 in
+  def : Pat<(x86mmx (MMX_X86buildvector VR64:$src1, VR64:$src2, VR64:$src3, VR64:$src4)),
+            (MMX_PUNPCKLDQirr
+              (MMX_PUNPCKLWDirr VR64:$src1, VR64:$src2),
+              (MMX_PUNPCKLWDirr VR64:$src3, VR64:$src4))>;
+  let AddedComplexity = 20 in
+  def : Pat<(x86mmx (MMX_X86buildvector VR64:$src1, VR64:$src2, VR64:$src3, VR64:$src4,
+                                        VR64:$src5, VR64:$src6, VR64:$src7, VR64:$src8)),
+            (MMX_PUNPCKLDQirr
+              (MMX_PUNPCKLWDirr
+                (MMX_PUNPCKLBWirr VR64:$src1, VR64:$src2),
+                (MMX_PUNPCKLBWirr VR64:$src3, VR64:$src4)),
+              (MMX_PUNPCKLWDirr
+                (MMX_PUNPCKLBWirr VR64:$src5, VR64:$src6),
+                (MMX_PUNPCKLBWirr VR64:$src7, VR64:$src8)))>;
+
+  // Broadcasts.
+  let AddedComplexity = 30 in
+  def : Pat<(x86mmx (MMX_X86buildvector VR64:$src, VR64:$src)),
+            (MMX_PSHUFWri VR64:$src, 0x44)>;
+  let AddedComplexity = 40 in
+  def : Pat<(x86mmx (MMX_X86buildvector VR64:$src, VR64:$src, VR64:$src, VR64:$src)),
+            (MMX_PSHUFWri VR64:$src, 0x00)>;
+  let AddedComplexity = 50 in
+  def : Pat<(x86mmx (MMX_X86buildvector VR64:$src, VR64:$src, VR64:$src, VR64:$src,
+                                        VR64:$src, VR64:$src, VR64:$src, VR64:$src)),
+            (MMX_PSHUFWri (MMX_PUNPCKLBWirr VR64:$src, VR64:$src), 0x00)>;
+  let Predicates = [HasSSSE3], AddedComplexity = 60 in {
+  def : Pat<(x86mmx (MMX_X86buildvector VR64:$src, VR64:$src, VR64:$src, VR64:$src,
+                                        VR64:$src, VR64:$src, VR64:$src, VR64:$src)),
+            (MMX_PSHUFBrr VR64:$src, (MMX_SET0))>;
+  }
+}
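A companion sketch for the broadcast patterns (again hypothetical; build_v2i32_00 below is the real test). For a splat, the combine above creates one MMX element and repeats it, so every operand of MMX_X86buildvector is the same node and the higher-AddedComplexity pshufw pattern wins over the generic punpckldq one:

; Sketch only - assumed function name.
declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)

define void @example_v2i32_splat(x86_mmx* %p0, i32 %a0) nounwind {
  %1 = insertelement <2 x i32> undef, i32 %a0, i32 0
  %2 = insertelement <2 x i32> %1, i32 %a0, i32 1
  %3 = bitcast <2 x i32> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx* %p0
  ret void
}

; Expected x86-64 output after this patch (per build_v2i32_00 below):
;   movd %esi, %mm0
;   pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
;   paddd %mm0, %mm0
;   movq %mm0, (%rdi)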
Index: test/CodeGen/X86/3dnow-intrinsics.ll
===================================================================
--- test/CodeGen/X86/3dnow-intrinsics.ll
+++ test/CodeGen/X86/3dnow-intrinsics.ll
@@ -35,12 +35,11 @@
 ; X86-NEXT: pushl %ebp
 ; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: pf2id {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: pf2id %mm1, %mm0
 ; X86-NEXT: movq %mm0, (%esp)
 ; X86-NEXT: movl (%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -71,18 +70,15 @@
 ; X86-NEXT: pushl %ebp
 ; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfacc {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfacc %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
 ; X86-NEXT: flds {{[0-9]+}}(%esp)
 ; X86-NEXT: flds (%esp)
 ; X86-NEXT: movl %ebp, %esp
@@ -113,18 +109,15 @@
 ; X86-NEXT: pushl %ebp
 ; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfadd {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfadd %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
 ; X86-NEXT: flds {{[0-9]+}}(%esp)
 ; X86-NEXT: flds (%esp)
 ; X86-NEXT: movl %ebp, %esp
@@ -155,18 +148,15 @@
 ; X86-NEXT: pushl %ebp
 ; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfcmpeq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfcmpeq %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
 ; X86-NEXT: movl (%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl %ebp, %esp
@@ -198,18 +188,15 @@
 ; X86-NEXT: pushl %ebp
 ; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfcmpge {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfcmpge %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
 ; X86-NEXT: movl (%esp), %eax
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl %ebp, %esp
@@ -241,18 +228,15 @@
 ; X86-NEXT: pushl %ebp
 ; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfcmpgt {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp),
%mm0 +; X86-NEXT: movd 16(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm2 +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: pfcmpgt %mm1, %mm2 +; X86-NEXT: movq %mm2, (%esp) ; X86-NEXT: movl (%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %ebp, %esp @@ -284,18 +268,15 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 20(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 16(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: pfmax {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 20(%ebp), %mm0 +; X86-NEXT: movd 16(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm2 +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: pfmax %mm1, %mm2 +; X86-NEXT: movq %mm2, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: movl %ebp, %esp @@ -326,18 +307,15 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 20(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 16(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: pfmin {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 20(%ebp), %mm0 +; X86-NEXT: movd 16(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm2 +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: pfmin %mm1, %mm2 +; X86-NEXT: movq %mm2, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: movl %ebp, %esp @@ -368,18 +346,15 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 20(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 16(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: pfmul {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 20(%ebp), %mm0 +; X86-NEXT: movd 16(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm2 +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: pfmul %mm1, %mm2 +; X86-NEXT: movq %mm2, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: movl %ebp, %esp @@ -410,12 +385,11 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $16, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: pfrcp {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; 
X86-NEXT: pfrcp %mm1, %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) @@ -445,18 +419,15 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 20(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 16(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: pfrcpit1 {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 20(%ebp), %mm0 +; X86-NEXT: movd 16(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm2 +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: pfrcpit1 %mm1, %mm2 +; X86-NEXT: movq %mm2, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: movl %ebp, %esp @@ -487,18 +458,15 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 20(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 16(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: pfrcpit2 {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 20(%ebp), %mm0 +; X86-NEXT: movd 16(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm2 +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: pfrcpit2 %mm1, %mm2 +; X86-NEXT: movq %mm2, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: movl %ebp, %esp @@ -529,12 +497,11 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $16, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: pfrsqrt {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: pfrsqrt %mm1, %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) @@ -564,18 +531,15 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 20(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 16(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: pfrsqit1 {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 20(%ebp), %mm0 +; X86-NEXT: movd 16(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm2 +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: pfrsqit1 %mm1, %mm2 +; X86-NEXT: movq %mm2, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: movl %ebp, %esp @@ -606,18 +570,15 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, 
%esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 20(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 16(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: pfsub {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 20(%ebp), %mm0 +; X86-NEXT: movd 16(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm2 +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: pfsub %mm1, %mm2 +; X86-NEXT: movq %mm2, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: movl %ebp, %esp @@ -648,18 +609,15 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 20(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 16(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: pfsubr {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 20(%ebp), %mm0 +; X86-NEXT: movd 16(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm2 +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: pfsubr %mm1, %mm2 +; X86-NEXT: movq %mm2, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: movl %ebp, %esp @@ -748,12 +706,11 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $16, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: pf2iw {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: pf2iw %mm1, %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -784,18 +741,15 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 20(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 16(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: pfnacc {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 20(%ebp), %mm0 +; X86-NEXT: movd 16(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm2 +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: pfnacc %mm1, %mm2 +; X86-NEXT: movq %mm2, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: movl %ebp, %esp @@ -826,18 +780,15 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $24, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 20(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 16(%ebp) -; X86-NEXT: fstps 
{{[0-9]+}}(%esp) -; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: pfpnacc {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 20(%ebp), %mm0 +; X86-NEXT: movd 16(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm2 +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: pfpnacc %mm1, %mm2 +; X86-NEXT: movq %mm2, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: movl %ebp, %esp @@ -899,12 +850,11 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $16, %esp -; X86-NEXT: flds 12(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: flds 8(%ebp) -; X86-NEXT: fstps {{[0-9]+}}(%esp) -; X86-NEXT: pswapd {{[0-9]+}}(%esp), %mm0 # mm0 = mem[1,0] +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: pswapd %mm1, %mm0 # mm0 = mm1[1,0] ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) @@ -932,12 +882,11 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $16, %esp -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: pswapd {{[0-9]+}}(%esp), %mm0 # mm0 = mem[1,0] +; X86-NEXT: subl $8, %esp +; X86-NEXT: movd 12(%ebp), %mm0 +; X86-NEXT: movd 8(%ebp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: pswapd %mm1, %mm0 # mm0 = mm1[1,0] ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx Index: test/CodeGen/X86/fast-isel-bc.ll =================================================================== --- test/CodeGen/X86/fast-isel-bc.ll +++ test/CodeGen/X86/fast-isel-bc.ll @@ -17,9 +17,11 @@ ; X86-LABEL: func1: ; X86: ## %bb.0: ; X86-NEXT: subl $12, %esp -; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movsd %xmm0, (%esp) -; X86-NEXT: movq (%esp), %mm0 +; X86-NEXT: movl $2, %eax +; X86-NEXT: movd %eax, %mm0 +; X86-NEXT: pxor %mm1, %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 ## mm1 = mm1[0],mm0[0] +; X86-NEXT: movq %mm1, %mm0 ; X86-NEXT: calll _func2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: retl @@ -28,13 +30,10 @@ ; X64: ## %bb.0: ; X64-NEXT: pushq %rax ; X64-NEXT: movl $2, %eax -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: movq %rcx, %xmm0 -; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rsp) -; X64-NEXT: movq (%rsp), %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movd %eax, %mm0 +; X64-NEXT: pxor %mm1, %mm1 +; X64-NEXT: punpckldq %mm0, %mm1 ## mm1 = mm1[0],mm0[0] +; X64-NEXT: movq2dq %mm1, %xmm0 ; X64-NEXT: callq _func2 ; X64-NEXT: popq %rax ; X64-NEXT: retq Index: test/CodeGen/X86/mmx-build-vector.ll =================================================================== --- test/CodeGen/X86/mmx-build-vector.ll +++ test/CodeGen/X86/mmx-build-vector.ll @@ -15,60 +15,24 @@ ; define void @build_v2i32_01(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind { -; X86-MMX-LABEL: build_v2i32_01: -; X86-MMX: # %bb.0: -; X86-MMX-NEXT: pushl %ebp -; X86-MMX-NEXT: movl %esp, %ebp -; X86-MMX-NEXT: andl $-8, %esp -; X86-MMX-NEXT: subl $8, %esp -; X86-MMX-NEXT: movl 8(%ebp), %eax -; X86-MMX-NEXT: movl 12(%ebp), %ecx -; X86-MMX-NEXT: 
movl 16(%ebp), %edx -; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-MMX-NEXT: movl %ecx, (%esp) -; X86-MMX-NEXT: movq (%esp), %mm0 -; X86-MMX-NEXT: paddd %mm0, %mm0 -; X86-MMX-NEXT: movq %mm0, (%eax) -; X86-MMX-NEXT: movl %ebp, %esp -; X86-MMX-NEXT: popl %ebp -; X86-MMX-NEXT: retl -; -; X86-SSE-LABEL: build_v2i32_01: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movlps %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 -; X86-SSE-NEXT: paddd %mm0, %mm0 -; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp -; X86-SSE-NEXT: retl -; -; X64-SSE-LABEL: build_v2i32_01: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: movd %esi, %xmm1 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE-NEXT: paddd %mm0, %mm0 -; X64-SSE-NEXT: movq %mm0, (%rdi) -; X64-SSE-NEXT: retq +; X86-LABEL: build_v2i32_01: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: paddd %mm1, %mm1 +; X86-NEXT: movq %mm1, (%eax) +; X86-NEXT: retl ; -; X64-AVX-LABEL: build_v2i32_01: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovd %esi, %xmm0 -; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-AVX-NEXT: paddd %mm0, %mm0 -; X64-AVX-NEXT: movq %mm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-LABEL: build_v2i32_01: +; X64: # %bb.0: +; X64-NEXT: movd %edx, %mm0 +; X64-NEXT: movd %esi, %mm1 +; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X64-NEXT: paddd %mm1, %mm1 +; X64-NEXT: movq %mm1, (%rdi) +; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 %a0, i32 0 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1 %3 = bitcast <2 x i32> %2 to x86_mmx @@ -101,78 +65,22 @@ } define void @build_v2i32_u1(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind { -; X86-MMX-LABEL: build_v2i32_u1: -; X86-MMX: # %bb.0: -; X86-MMX-NEXT: pushl %ebp -; X86-MMX-NEXT: movl %esp, %ebp -; X86-MMX-NEXT: andl $-8, %esp -; X86-MMX-NEXT: subl $8, %esp -; X86-MMX-NEXT: movl 8(%ebp), %eax -; X86-MMX-NEXT: movl 16(%ebp), %ecx -; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-MMX-NEXT: movq (%esp), %mm0 -; X86-MMX-NEXT: paddd %mm0, %mm0 -; X86-MMX-NEXT: movq %mm0, (%eax) -; X86-MMX-NEXT: movl %ebp, %esp -; X86-MMX-NEXT: popl %ebp -; X86-MMX-NEXT: retl +; X86-LABEL: build_v2i32_u1: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] +; X86-NEXT: paddd %mm0, %mm0 +; X86-NEXT: movq %mm0, (%eax) +; X86-NEXT: retl ; -; X86-SSE-LABEL: build_v2i32_u1: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X86-SSE-NEXT: movq %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 -; X86-SSE-NEXT: paddd %mm0, %mm0 -; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp -; 
X86-SSE-NEXT: retl -; -; X64-SSE-LABEL: build_v2i32_u1: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE-NEXT: paddd %mm0, %mm0 -; X64-SSE-NEXT: movq %mm0, (%rdi) -; X64-SSE-NEXT: retq -; -; X64-AVX1-LABEL: build_v2i32_u1: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovd %edx, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-AVX1-NEXT: paddd %mm0, %mm0 -; X64-AVX1-NEXT: movq %mm0, (%rdi) -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: build_v2i32_u1: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovd %edx, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-AVX2-NEXT: paddd %mm0, %mm0 -; X64-AVX2-NEXT: movq %mm0, (%rdi) -; X64-AVX2-NEXT: retq -; -; X64-AVX512-LABEL: build_v2i32_u1: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovd %edx, %xmm0 -; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-AVX512-NEXT: paddd %mm0, %mm0 -; X64-AVX512-NEXT: movq %mm0, (%rdi) -; X64-AVX512-NEXT: retq +; X64-LABEL: build_v2i32_u1: +; X64: # %bb.0: +; X64-NEXT: movd %edx, %mm0 +; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] +; X64-NEXT: paddd %mm0, %mm0 +; X64-NEXT: movq %mm0, (%rdi) +; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 undef, i32 0 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1 %3 = bitcast <2 x i32> %2 to x86_mmx @@ -182,63 +90,24 @@ } define void @build_v2i32_z1(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind { -; X86-MMX-LABEL: build_v2i32_z1: -; X86-MMX: # %bb.0: -; X86-MMX-NEXT: pushl %ebp -; X86-MMX-NEXT: movl %esp, %ebp -; X86-MMX-NEXT: andl $-8, %esp -; X86-MMX-NEXT: subl $8, %esp -; X86-MMX-NEXT: movl 8(%ebp), %eax -; X86-MMX-NEXT: movl 16(%ebp), %ecx -; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-MMX-NEXT: movl $0, (%esp) -; X86-MMX-NEXT: movq (%esp), %mm0 -; X86-MMX-NEXT: paddd %mm0, %mm0 -; X86-MMX-NEXT: movq %mm0, (%eax) -; X86-MMX-NEXT: movl %ebp, %esp -; X86-MMX-NEXT: popl %ebp -; X86-MMX-NEXT: retl -; -; X86-SSE-LABEL: build_v2i32_z1: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; X86-SSE-NEXT: movq %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 -; X86-SSE-NEXT: paddd %mm0, %mm0 -; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp -; X86-SSE-NEXT: retl -; -; X64-SSE-LABEL: build_v2i32_z1: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: # kill: def $edx killed $edx def $rdx -; X64-SSE-NEXT: movq %rdx, %xmm0 -; X64-SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE-NEXT: paddd %mm0, %mm0 -; X64-SSE-NEXT: movq %mm0, (%rdi) -; X64-SSE-NEXT: retq +; X86-LABEL: build_v2i32_z1: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: pxor %mm1, %mm1 +; X86-NEXT: punpckldq 
%mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: paddd %mm1, %mm1 +; X86-NEXT: movq %mm1, (%eax) +; X86-NEXT: retl ; -; X64-AVX-LABEL: build_v2i32_z1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: # kill: def $edx killed $edx def $rdx -; X64-AVX-NEXT: vmovq %rdx, %xmm0 -; X64-AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-AVX-NEXT: paddd %mm0, %mm0 -; X64-AVX-NEXT: movq %mm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-LABEL: build_v2i32_z1: +; X64: # %bb.0: +; X64-NEXT: movd %edx, %mm0 +; X64-NEXT: pxor %mm1, %mm1 +; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X64-NEXT: paddd %mm1, %mm1 +; X64-NEXT: movq %mm1, (%rdi) +; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 0, i32 0 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1 %3 = bitcast <2 x i32> %2 to x86_mmx @@ -248,79 +117,22 @@ } define void @build_v2i32_00(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind { -; X86-MMX-LABEL: build_v2i32_00: -; X86-MMX: # %bb.0: -; X86-MMX-NEXT: pushl %ebp -; X86-MMX-NEXT: movl %esp, %ebp -; X86-MMX-NEXT: andl $-8, %esp -; X86-MMX-NEXT: subl $8, %esp -; X86-MMX-NEXT: movl 8(%ebp), %eax -; X86-MMX-NEXT: movl 12(%ebp), %ecx -; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-MMX-NEXT: movl %ecx, (%esp) -; X86-MMX-NEXT: movq (%esp), %mm0 -; X86-MMX-NEXT: paddd %mm0, %mm0 -; X86-MMX-NEXT: movq %mm0, (%eax) -; X86-MMX-NEXT: movl %ebp, %esp -; X86-MMX-NEXT: popl %ebp -; X86-MMX-NEXT: retl +; X86-LABEL: build_v2i32_00: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] +; X86-NEXT: paddd %mm0, %mm0 +; X86-NEXT: movq %mm0, (%eax) +; X86-NEXT: retl ; -; X86-SSE-LABEL: build_v2i32_00: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X86-SSE-NEXT: movq %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 -; X86-SSE-NEXT: paddd %mm0, %mm0 -; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp -; X86-SSE-NEXT: retl -; -; X64-SSE-LABEL: build_v2i32_00: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movd %esi, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE-NEXT: paddd %mm0, %mm0 -; X64-SSE-NEXT: movq %mm0, (%rdi) -; X64-SSE-NEXT: retq -; -; X64-AVX1-LABEL: build_v2i32_00: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovd %esi, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-AVX1-NEXT: paddd %mm0, %mm0 -; X64-AVX1-NEXT: movq %mm0, (%rdi) -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: build_v2i32_00: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovd %esi, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-AVX2-NEXT: paddd %mm0, %mm0 -; X64-AVX2-NEXT: movq %mm0, (%rdi) -; X64-AVX2-NEXT: retq -; -; X64-AVX512-LABEL: build_v2i32_00: -; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovd %esi, %xmm0 -; X64-AVX512-NEXT: vpbroadcastd %xmm0, 
%xmm0 -; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-AVX512-NEXT: paddd %mm0, %mm0 -; X64-AVX512-NEXT: movq %mm0, (%rdi) -; X64-AVX512-NEXT: retq +; X64-LABEL: build_v2i32_00: +; X64: # %bb.0: +; X64-NEXT: movd %esi, %mm0 +; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] +; X64-NEXT: paddd %mm0, %mm0 +; X64-NEXT: movq %mm0, (%rdi) +; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 %a0, i32 0 %2 = insertelement <2 x i32> %1, i32 %a0, i32 1 %3 = bitcast <2 x i32> %2 to x86_mmx @@ -334,95 +146,32 @@ ; define void @build_v4i16_0123(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind { -; X86-MMX-LABEL: build_v4i16_0123: -; X86-MMX: # %bb.0: -; X86-MMX-NEXT: pushl %ebp -; X86-MMX-NEXT: movl %esp, %ebp -; X86-MMX-NEXT: andl $-8, %esp -; X86-MMX-NEXT: subl $8, %esp -; X86-MMX-NEXT: movl 8(%ebp), %eax -; X86-MMX-NEXT: movl 24(%ebp), %ecx -; X86-MMX-NEXT: shll $16, %ecx -; X86-MMX-NEXT: movzwl 20(%ebp), %edx -; X86-MMX-NEXT: orl %ecx, %edx -; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-MMX-NEXT: movl 16(%ebp), %ecx -; X86-MMX-NEXT: shll $16, %ecx -; X86-MMX-NEXT: movzwl 12(%ebp), %edx -; X86-MMX-NEXT: orl %ecx, %edx -; X86-MMX-NEXT: movl %edx, (%esp) -; X86-MMX-NEXT: movq (%esp), %mm0 -; X86-MMX-NEXT: paddd %mm0, %mm0 -; X86-MMX-NEXT: movq %mm0, (%eax) -; X86-MMX-NEXT: movl %ebp, %esp -; X86-MMX-NEXT: popl %ebp -; X86-MMX-NEXT: retl -; -; X86-SSE-LABEL: build_v4i16_0123: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pinsrw $1, 16(%ebp), %xmm0 -; X86-SSE-NEXT: pinsrw $2, 20(%ebp), %xmm0 -; X86-SSE-NEXT: pinsrw $3, 24(%ebp), %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 -; X86-SSE-NEXT: paddd %mm0, %mm0 -; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp -; X86-SSE-NEXT: retl -; -; X64-SSE2-LABEL: build_v4i16_0123: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd %r8d, %xmm0 -; X64-SSE2-NEXT: movd %ecx, %xmm1 -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE2-NEXT: movd %edx, %xmm0 -; X64-SSE2-NEXT: movd %esi, %xmm2 -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] -; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE2-NEXT: paddd %mm0, %mm0 -; X64-SSE2-NEXT: movq %mm0, (%rdi) -; X64-SSE2-NEXT: retq -; -; X64-SSSE3-LABEL: build_v4i16_0123: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: movd %r8d, %xmm0 -; X64-SSSE3-NEXT: movd %ecx, %xmm1 -; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSSE3-NEXT: movd %edx, %xmm0 -; X64-SSSE3-NEXT: movd %esi, %xmm2 -; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; X64-SSSE3-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSSE3-NEXT: paddd %mm0, %mm0 -; X64-SSSE3-NEXT: movq %mm0, (%rdi) -; X64-SSSE3-NEXT: 
retq +; X86-LABEL: build_v4i16_0123: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 +; X86-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2 +; X86-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1] +; X86-NEXT: punpckldq %mm1, %mm2 # mm2 = mm2[0],mm1[0] +; X86-NEXT: paddd %mm2, %mm2 +; X86-NEXT: movq %mm2, (%eax) +; X86-NEXT: retl ; -; X64-AVX-LABEL: build_v4i16_0123: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovd %esi, %xmm0 -; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrd $3, %r8d, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-AVX-NEXT: paddd %mm0, %mm0 -; X64-AVX-NEXT: movq %mm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-LABEL: build_v4i16_0123: +; X64: # %bb.0: +; X64-NEXT: movd %r8d, %mm0 +; X64-NEXT: movd %ecx, %mm1 +; X64-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; X64-NEXT: movd %edx, %mm0 +; X64-NEXT: movd %esi, %mm2 +; X64-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1] +; X64-NEXT: punpckldq %mm1, %mm2 # mm2 = mm2[0],mm1[0] +; X64-NEXT: paddd %mm2, %mm2 +; X64-NEXT: movq %mm2, (%rdi) +; X64-NEXT: retq %1 = insertelement <4 x i16> undef, i16 %a0, i32 0 %2 = insertelement <4 x i16> %1, i16 %a1, i32 1 %3 = insertelement <4 x i16> %2, i16 %a2, i32 2 @@ -434,105 +183,30 @@ } define void @build_v4i16_01zz(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind { -; X86-MMX-LABEL: build_v4i16_01zz: -; X86-MMX: # %bb.0: -; X86-MMX-NEXT: pushl %ebp -; X86-MMX-NEXT: movl %esp, %ebp -; X86-MMX-NEXT: andl $-8, %esp -; X86-MMX-NEXT: subl $8, %esp -; X86-MMX-NEXT: movl 8(%ebp), %eax -; X86-MMX-NEXT: movl 16(%ebp), %ecx -; X86-MMX-NEXT: shll $16, %ecx -; X86-MMX-NEXT: movzwl 12(%ebp), %edx -; X86-MMX-NEXT: orl %ecx, %edx -; X86-MMX-NEXT: movl %edx, (%esp) -; X86-MMX-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-MMX-NEXT: movq (%esp), %mm0 -; X86-MMX-NEXT: paddd %mm0, %mm0 -; X86-MMX-NEXT: movq %mm0, (%eax) -; X86-MMX-NEXT: movl %ebp, %esp -; X86-MMX-NEXT: popl %ebp -; X86-MMX-NEXT: retl -; -; X86-SSE2-LABEL: build_v4i16_01zz: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $8, %esp -; X86-SSE2-NEXT: movl 8(%ebp), %eax -; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: movq %xmm0, (%esp) -; X86-SSE2-NEXT: movq (%esp), %mm0 -; X86-SSE2-NEXT: paddd %mm0, %mm0 -; X86-SSE2-NEXT: movq %mm0, (%eax) -; X86-SSE2-NEXT: movl %ebp, %esp -; X86-SSE2-NEXT: popl %ebp -; X86-SSE2-NEXT: retl -; -; X86-SSSE3-LABEL: build_v4i16_01zz: -; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: pushl %ebp -; X86-SSSE3-NEXT: movl %esp, %ebp -; X86-SSSE3-NEXT: andl $-8, %esp -; X86-SSSE3-NEXT: subl $8, %esp -; X86-SSSE3-NEXT: movl 8(%ebp), %eax -; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-SSSE3-NEXT: movq %xmm1, (%esp) -; X86-SSSE3-NEXT: movq (%esp), %mm0 -; X86-SSSE3-NEXT: paddd %mm0, %mm0 -; X86-SSSE3-NEXT: movq %mm0, (%eax) -; X86-SSSE3-NEXT: movl %ebp, %esp -; X86-SSSE3-NEXT: popl %ebp -; X86-SSSE3-NEXT: retl -; -; X64-SSE2-LABEL: build_v4i16_01zz: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd %edx, %xmm0 -; X64-SSE2-NEXT: movd %esi, %xmm1 -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE2-NEXT: paddd %mm0, %mm0 -; X64-SSE2-NEXT: movq %mm0, (%rdi) -; X64-SSE2-NEXT: retq -; -; X64-SSSE3-LABEL: build_v4i16_01zz: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: movd %edx, %xmm0 -; X64-SSSE3-NEXT: movd %esi, %xmm1 -; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSSE3-NEXT: paddd %mm0, %mm0 -; X64-SSSE3-NEXT: movq %mm0, (%rdi) -; X64-SSSE3-NEXT: retq +; X86-LABEL: build_v4i16_01zz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 +; X86-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; X86-NEXT: pxor %mm0, %mm0 +; X86-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1] +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: paddd %mm1, %mm1 +; X86-NEXT: movq %mm1, (%eax) +; X86-NEXT: retl ; -; X64-AVX-LABEL: build_v4i16_01zz: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovd %edx, %xmm0 -; X64-AVX-NEXT: vmovd %esi, %xmm1 -; X64-AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-AVX-NEXT: paddd %mm0, %mm0 -; X64-AVX-NEXT: movq %mm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-LABEL: build_v4i16_01zz: +; X64: # %bb.0: +; X64-NEXT: movd %edx, %mm0 +; X64-NEXT: movd %esi, %mm1 +; X64-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; X64-NEXT: pxor %mm0, %mm0 +; X64-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1] +; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X64-NEXT: paddd %mm1, %mm1 +; X64-NEXT: movq %mm1, (%rdi) +; X64-NEXT: retq %1 = insertelement <4 x i16> undef, i16 %a0, i32 0 %2 = insertelement <4 x i16> %1, i16 %a1, i32 1 %3 = insertelement <4 x i16> %2, i16 0, i32 2 @@ -596,88 +270,30 @@ } define void @build_v4i16_012u(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind { -; X86-MMX-LABEL: build_v4i16_012u: -; X86-MMX: # %bb.0: -; X86-MMX-NEXT: pushl %ebp -; X86-MMX-NEXT: movl %esp, %ebp -; X86-MMX-NEXT: andl $-8, %esp -; X86-MMX-NEXT: subl $8, %esp -; X86-MMX-NEXT: movl 8(%ebp), %eax -; X86-MMX-NEXT: movl 16(%ebp), %ecx -; X86-MMX-NEXT: shll $16, %ecx 
-; X86-MMX-NEXT: movzwl 12(%ebp), %edx -; X86-MMX-NEXT: orl %ecx, %edx -; X86-MMX-NEXT: movl %edx, (%esp) -; X86-MMX-NEXT: shll $16, %ecx -; X86-MMX-NEXT: movzwl 20(%ebp), %edx -; X86-MMX-NEXT: orl %ecx, %edx -; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-MMX-NEXT: movq (%esp), %mm0 -; X86-MMX-NEXT: paddd %mm0, %mm0 -; X86-MMX-NEXT: movq %mm0, (%eax) -; X86-MMX-NEXT: movl %ebp, %esp -; X86-MMX-NEXT: popl %ebp -; X86-MMX-NEXT: retl -; -; X86-SSE-LABEL: build_v4i16_012u: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movl 8(%ebp), %eax -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pinsrw $1, 16(%ebp), %xmm0 -; X86-SSE-NEXT: pinsrw $2, 20(%ebp), %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%esp) -; X86-SSE-NEXT: movq (%esp), %mm0 -; X86-SSE-NEXT: paddd %mm0, %mm0 -; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp -; X86-SSE-NEXT: retl -; -; X64-SSE2-LABEL: build_v4i16_012u: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd %edx, %xmm0 -; X64-SSE2-NEXT: movd %esi, %xmm1 -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE2-NEXT: movd %ecx, %xmm0 -; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE2-NEXT: paddd %mm0, %mm0 -; X64-SSE2-NEXT: movq %mm0, (%rdi) -; X64-SSE2-NEXT: retq -; -; X64-SSSE3-LABEL: build_v4i16_012u: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: movd %edx, %xmm0 -; X64-SSSE3-NEXT: movd %esi, %xmm1 -; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSSE3-NEXT: movd %ecx, %xmm0 -; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSSE3-NEXT: paddd %mm0, %mm0 -; X64-SSSE3-NEXT: movq %mm0, (%rdi) -; X64-SSSE3-NEXT: retq +; X86-LABEL: build_v4i16_012u: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1] +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2 +; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1] +; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: paddd %mm2, %mm2 +; X86-NEXT: movq %mm2, (%eax) +; X86-NEXT: retl ; -; X64-AVX-LABEL: build_v4i16_012u: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovd %esi, %xmm0 -; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-AVX-NEXT: paddd %mm0, %mm0 -; X64-AVX-NEXT: movq %mm0, (%rdi) -; X64-AVX-NEXT: retq +; X64-LABEL: build_v4i16_012u: +; X64: # %bb.0: +; X64-NEXT: movd %ecx, %mm0 +; X64-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1] +; X64-NEXT: movd %edx, %mm1 +; X64-NEXT: movd %esi, %mm2 +; X64-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1] +; X64-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; 
X64-NEXT: paddd %mm2, %mm2 +; X64-NEXT: movq %mm2, (%rdi) +; X64-NEXT: retq %1 = insertelement <4 x i16> undef, i16 %a0, i32 0 %2 = insertelement <4 x i16> %1, i16 %a1, i32 1 %3 = insertelement <4 x i16> %2, i16 %a2, i32 2 @@ -689,117 +305,22 @@ } define void @build_v4i16_0u00(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind { -; X86-MMX-LABEL: build_v4i16_0u00: -; X86-MMX: # %bb.0: -; X86-MMX-NEXT: pushl %ebp -; X86-MMX-NEXT: movl %esp, %ebp -; X86-MMX-NEXT: andl $-8, %esp -; X86-MMX-NEXT: subl $8, %esp -; X86-MMX-NEXT: movl 8(%ebp), %eax -; X86-MMX-NEXT: movzwl 12(%ebp), %ecx -; X86-MMX-NEXT: movl %ecx, %edx -; X86-MMX-NEXT: shll $16, %edx -; X86-MMX-NEXT: orl %ecx, %edx -; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-MMX-NEXT: shll $16, %edx -; X86-MMX-NEXT: orl %ecx, %edx -; X86-MMX-NEXT: movl %edx, (%esp) -; X86-MMX-NEXT: movq (%esp), %mm0 -; X86-MMX-NEXT: paddd %mm0, %mm0 -; X86-MMX-NEXT: movq %mm0, (%eax) -; X86-MMX-NEXT: movl %ebp, %esp -; X86-MMX-NEXT: popl %ebp -; X86-MMX-NEXT: retl -; -; X86-SSE2-LABEL: build_v4i16_0u00: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pushl %ebp -; X86-SSE2-NEXT: movl %esp, %ebp -; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $8, %esp -; X86-SSE2-NEXT: movl 8(%ebp), %eax -; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,0] -; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: movq %xmm0, (%esp) -; X86-SSE2-NEXT: movq (%esp), %mm0 -; X86-SSE2-NEXT: paddd %mm0, %mm0 -; X86-SSE2-NEXT: movq %mm0, (%eax) -; X86-SSE2-NEXT: movl %ebp, %esp -; X86-SSE2-NEXT: popl %ebp -; X86-SSE2-NEXT: retl -; -; X86-SSSE3-LABEL: build_v4i16_0u00: -; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: pushl %ebp -; X86-SSSE3-NEXT: movl %esp, %ebp -; X86-SSSE3-NEXT: andl $-8, %esp -; X86-SSSE3-NEXT: subl $8, %esp -; X86-SSSE3-NEXT: movl 8(%ebp), %eax -; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3] -; X86-SSSE3-NEXT: movq %xmm0, (%esp) -; X86-SSSE3-NEXT: movq (%esp), %mm0 -; X86-SSSE3-NEXT: paddd %mm0, %mm0 -; X86-SSSE3-NEXT: movq %mm0, (%eax) -; X86-SSSE3-NEXT: movl %ebp, %esp -; X86-SSSE3-NEXT: popl %ebp -; X86-SSSE3-NEXT: retl -; -; X64-SSE2-LABEL: build_v4i16_0u00: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movd %esi, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,0] -; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSE2-NEXT: paddd %mm0, %mm0 -; X64-SSE2-NEXT: movq %mm0, (%rdi) -; X64-SSE2-NEXT: retq -; -; X64-SSSE3-LABEL: build_v4i16_0u00: -; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: movd %esi, %xmm0 -; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,0,1,0,1,0,1,0,1,0,1,2,3] -; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 -; X64-SSSE3-NEXT: paddd %mm0, %mm0 -; X64-SSSE3-NEXT: movq %mm0, (%rdi) -; X64-SSSE3-NEXT: retq +; X86-LABEL: build_v4i16_0u00: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 +; X86-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] +; X86-NEXT: paddd %mm0, %mm0 +; X86-NEXT: movq %mm0, (%eax) +; X86-NEXT: retl ; -; X64-AVX1-LABEL: build_v4i16_0u00: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovd %esi, %xmm0 -; X64-AVX1-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[0,1,u,u,0,1,0,1,0,1,0,1,0,1,2,3]
-; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v4i16_0u00:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovd %esi, %xmm0
-; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v4i16_0u00:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovd %esi, %xmm0
-; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X64-LABEL: build_v4i16_0u00:
+; X64: # %bb.0:
+; X64-NEXT: movd %esi, %mm0
+; X64-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
+; X64-NEXT: paddd %mm0, %mm0
+; X64-NEXT: movq %mm0, (%rdi)
+; X64-NEXT: retq
   %1 = insertelement <4 x i16> undef, i16 %a0, i32 0
   %2 = insertelement <4 x i16> %1, i16 undef, i32 1
   %3 = insertelement <4 x i16> %2, i16 %a0, i32 2
@@ -815,124 +336,48 @@
 ;
 
 define void @build_v8i8_01234567(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
-; X86-MMX-LABEL: build_v8i8_01234567:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: pushl %esi
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $16, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 40(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 36(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: shll $16, %edx
-; X86-MMX-NEXT: movl 32(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 28(%ebp), %esi
-; X86-MMX-NEXT: orl %ecx, %esi
-; X86-MMX-NEXT: movzwl %si, %ecx
-; X86-MMX-NEXT: orl %edx, %ecx
-; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movl 24(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 20(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: shll $16, %edx
-; X86-MMX-NEXT: movl 16(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 12(%ebp), %esi
-; X86-MMX-NEXT: orl %ecx, %esi
-; X86-MMX-NEXT: movzwl %si, %ecx
-; X86-MMX-NEXT: orl %edx, %ecx
-; X86-MMX-NEXT: movl %ecx, (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: leal -4(%ebp), %esp
-; X86-MMX-NEXT: popl %esi
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE-LABEL: build_v8i8_01234567:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: andl $-8, %esp
-; X86-SSE-NEXT: subl $16, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movl 24(%ebp), %ecx
-; X86-SSE-NEXT: shll $8, %ecx
-; X86-SSE-NEXT: movzbl 20(%ebp), %edx
-; X86-SSE-NEXT: orl %ecx, %edx
-; X86-SSE-NEXT: movl 16(%ebp), %ecx
-; X86-SSE-NEXT: shll $8, %ecx
-; X86-SSE-NEXT: movzbl 12(%ebp), %esi
-; X86-SSE-NEXT: orl %ecx, %esi
-; X86-SSE-NEXT: movd %esi, %xmm0
-; X86-SSE-NEXT: pinsrw $1, %edx, %xmm0
-; X86-SSE-NEXT: movl 32(%ebp), %ecx
-; X86-SSE-NEXT: shll $8, %ecx
-; X86-SSE-NEXT: movzbl 28(%ebp), %edx
-; X86-SSE-NEXT: orl %ecx, %edx
-; X86-SSE-NEXT: pinsrw $2, %edx, %xmm0
-; X86-SSE-NEXT: movl 40(%ebp), %ecx
-; X86-SSE-NEXT: shll $8, %ecx
-; X86-SSE-NEXT: movzbl 36(%ebp), %edx
-; X86-SSE-NEXT: orl %ecx, %edx
-; X86-SSE-NEXT: pinsrw $3, %edx, %xmm0
-; X86-SSE-NEXT: movq %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
-; X86-SSE-NEXT: paddd %mm0, %mm0
-; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: leal -4(%ebp), %esp
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: popl %ebp
-; X86-SSE-NEXT: retl
-;
-; X64-SSE-LABEL: build_v8i8_01234567:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: shll $8, %r8d
-; X64-SSE-NEXT: movzbl %cl, %eax
-; X64-SSE-NEXT: orl %r8d, %eax
-; X64-SSE-NEXT: shll $8, %edx
-; X64-SSE-NEXT: movzbl %sil, %ecx
-; X64-SSE-NEXT: orl %edx, %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: pinsrw $1, %eax, %xmm0
-; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT: shll $8, %eax
-; X64-SSE-NEXT: movzbl %r9b, %ecx
-; X64-SSE-NEXT: orl %eax, %ecx
-; X64-SSE-NEXT: pinsrw $2, %ecx, %xmm0
-; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT: shll $8, %eax
-; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; X64-SSE-NEXT: orl %eax, %ecx
-; X64-SSE-NEXT: pinsrw $3, %ecx, %xmm0
-; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
+; X86-LABEL: build_v8i8_01234567:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
+; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
+; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm3
+; X86-NEXT: punpcklbw %mm0, %mm3 # mm3 = mm3[0],mm0[0],mm3[1],mm0[1],mm3[2],mm0[2],mm3[3],mm0[3]
+; X86-NEXT: punpcklwd %mm1, %mm3 # mm3 = mm3[0],mm1[0],mm3[1],mm1[1]
+; X86-NEXT: punpckldq %mm2, %mm3 # mm3 = mm3[0],mm2[0]
+; X86-NEXT: paddd %mm3, %mm3
+; X86-NEXT: movq %mm3, (%eax)
+; X86-NEXT: retl
 ;
-; X64-AVX-LABEL: build_v8i8_01234567:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovd %esi, %xmm0
-; X64-AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $4, %r9d, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v8i8_01234567:
+; X64: # %bb.0:
+; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm0
+; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm1
+; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: movd %r9d, %mm0
+; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm2
+; X64-NEXT: punpcklbw %mm2, %mm0 # mm0 = mm0[0],mm2[0],mm0[1],mm2[1],mm0[2],mm2[2],mm0[3],mm2[3]
+; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
+; X64-NEXT: movd %r8d, %mm1
+; X64-NEXT: movd %ecx, %mm2
+; X64-NEXT: punpcklbw %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1],mm2[2],mm1[2],mm2[3],mm1[3]
+; X64-NEXT: movd %edx, %mm1
+; X64-NEXT: movd %esi, %mm3
+; X64-NEXT: punpcklbw %mm1, %mm3 # mm3 = mm3[0],mm1[0],mm3[1],mm1[1],mm3[2],mm1[2],mm3[3],mm1[3]
+; X64-NEXT: punpcklwd %mm2, %mm3 # mm3 = mm3[0],mm2[0],mm3[1],mm2[1]
+; X64-NEXT: punpckldq %mm0, %mm3 # mm3 = mm3[0],mm0[0]
+; X64-NEXT: paddd %mm3, %mm3
+; X64-NEXT: movq %mm3, (%rdi)
+; X64-NEXT: retq
   %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
   %2 = insertelement <8 x i8> %1, i8 %a1, i32 1
   %3 = insertelement <8 x i8> %2, i8 %a2, i32 2
@@ -948,158 +393,46 @@
 }
 
 define void @build_v8i8_0u2345z7(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
-; X86-MMX-LABEL: build_v8i8_0u2345z7:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 24(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 20(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: shll $16, %edx
-; X86-MMX-NEXT: movzbl 12(%ebp), %ecx
-; X86-MMX-NEXT: orl %edx, %ecx
-; X86-MMX-NEXT: movl %ecx, (%esp)
-; X86-MMX-NEXT: movl 32(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 28(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movzwl %dx, %ecx
-; X86-MMX-NEXT: movl 40(%ebp), %edx
-; X86-MMX-NEXT: shll $24, %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE2-LABEL: build_v8i8_0u2345z7:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-8, %esp
-; X86-SSE2-NEXT: subl $8, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: pxor %xmm1, %xmm1
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X86-SSE2-NEXT: packuswb %xmm0, %xmm0
-; X86-SSE2-NEXT: movq %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
-; X86-SSE2-NEXT: paddd %mm0, %mm0
-; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSSE3-LABEL: build_v8i8_0u2345z7:
-; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-8, %esp
-; X86-SSSE3-NEXT: subl $8, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: pxor %xmm1, %xmm1
-; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u]
-; X86-SSSE3-NEXT: movq %xmm0, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
-; X86-SSSE3-NEXT: retl
-;
-; X64-SSE2-LABEL: build_v8i8_0u2345z7:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE2-NEXT: pxor %xmm1, %xmm1
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-SSE2-NEXT: movd %r9d, %xmm0
-; X64-SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE2-NEXT: movd %r8d, %xmm1
-; X64-SSE2-NEXT: movd %ecx, %xmm2
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X64-SSE2-NEXT: movd %esi, %xmm1
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; X64-SSE2-NEXT: packuswb %xmm1, %xmm1
-; X64-SSE2-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v8i8_0u2345z7:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSSE3-NEXT: pxor %xmm1, %xmm1
-; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-SSSE3-NEXT: movd %r9d, %xmm0
-; X64-SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSSE3-NEXT: movd %r8d, %xmm1
-; X64-SSSE3-NEXT: movd %ecx, %xmm2
-; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X64-SSSE3-NEXT: movd %esi, %xmm1
-; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[4,6,8,10],zero,xmm1[14,u,u,u,u,u,u,u,u]
-; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
+; X86-LABEL: build_v8i8_0u2345z7:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: pxor %mm1, %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
+; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
+; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X86-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
+; X86-NEXT: punpckldq %mm2, %mm0 # mm0 = mm0[0],mm2[0]
+; X86-NEXT: paddd %mm0, %mm0
+; X86-NEXT: movq %mm0, (%eax)
+; X86-NEXT: retl
 ;
-; X64-AVX-LABEL: build_v8i8_0u2345z7:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $0, %esi, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $4, %r9d, %xmm0, %xmm0
-; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; X64-AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; X64-AVX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u]
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v8i8_0u2345z7:
+; X64: # %bb.0:
+; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm0
+; X64-NEXT: pxor %mm1, %mm1
+; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: movd %r9d, %mm0
+; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm2
+; X64-NEXT: punpcklbw %mm2, %mm0 # mm0 = mm0[0],mm2[0],mm0[1],mm2[1],mm0[2],mm2[2],mm0[3],mm2[3]
+; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
+; X64-NEXT: movd %r8d, %mm1
+; X64-NEXT: movd %ecx, %mm2
+; X64-NEXT: punpcklbw %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1],mm2[2],mm1[2],mm2[3],mm1[3]
+; X64-NEXT: movd %esi, %mm1
+; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: punpcklwd %mm2, %mm1 # mm1 = mm1[0],mm2[0],mm1[1],mm2[1]
+; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT: paddd %mm1, %mm1
+; X64-NEXT: movq %mm1, (%rdi)
+; X64-NEXT: retq
   %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
   %2 = insertelement <8 x i8> %1, i8 undef, i32 1
   %3 = insertelement <8 x i8> %2, i8 %a2, i32 2
@@ -1115,128 +448,44 @@
 }
 
 define void @build_v8i8_0123zzzu(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
-; X86-MMX-LABEL: build_v8i8_0123zzzu:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: pushl %esi
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $16, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 24(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 20(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: shll $16, %edx
-; X86-MMX-NEXT: movl 16(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 12(%ebp), %esi
-; X86-MMX-NEXT: orl %ecx, %esi
-; X86-MMX-NEXT: movzwl %si, %ecx
-; X86-MMX-NEXT: orl %edx, %ecx
-; X86-MMX-NEXT: movl %ecx, (%esp)
-; X86-MMX-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: leal -4(%ebp), %esp
-; X86-MMX-NEXT: popl %esi
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE2-LABEL: build_v8i8_0123zzzu:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-8, %esp
-; X86-SSE2-NEXT: subl $8, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movl 12(%ebp), %ecx
-; X86-SSE2-NEXT: pxor %xmm0, %xmm0
-; X86-SSE2-NEXT: pinsrw $0, %ecx, %xmm0
-; X86-SSE2-NEXT: movl 16(%ebp), %ecx
-; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
-; X86-SSE2-NEXT: movl 20(%ebp), %ecx
-; X86-SSE2-NEXT: pinsrw $2, %ecx, %xmm0
-; X86-SSE2-NEXT: movl 24(%ebp), %ecx
-; X86-SSE2-NEXT: pinsrw $3, %ecx, %xmm0
-; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X86-SSE2-NEXT: packuswb %xmm0, %xmm0
-; X86-SSE2-NEXT: movq %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
-; X86-SSE2-NEXT: paddd %mm0, %mm0
-; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSSE3-LABEL: build_v8i8_0123zzzu:
-; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-8, %esp
-; X86-SSSE3-NEXT: subl $8, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: movl 12(%ebp), %ecx
-; X86-SSSE3-NEXT: pxor %xmm0, %xmm0
-; X86-SSSE3-NEXT: pinsrw $0, %ecx, %xmm0
-; X86-SSSE3-NEXT: movl 16(%ebp), %ecx
-; X86-SSSE3-NEXT: pinsrw $1, %ecx, %xmm0
-; X86-SSSE3-NEXT: movl 20(%ebp), %ecx
-; X86-SSSE3-NEXT: pinsrw $2, %ecx, %xmm0
-; X86-SSSE3-NEXT: movl 24(%ebp), %ecx
-; X86-SSSE3-NEXT: pinsrw $3, %ecx, %xmm0
-; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; X86-SSSE3-NEXT: movq %xmm0, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
-; X86-SSSE3-NEXT: retl
-;
-; X64-SSE2-LABEL: build_v8i8_0123zzzu:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pxor %xmm0, %xmm0
-; X64-SSE2-NEXT: pinsrw $0, %esi, %xmm0
-; X64-SSE2-NEXT: pinsrw $1, %edx, %xmm0
-; X64-SSE2-NEXT: pinsrw $2, %ecx, %xmm0
-; X64-SSE2-NEXT: pinsrw $3, %r8d, %xmm0
-; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; X64-SSE2-NEXT: packuswb %xmm0, %xmm0
-; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v8i8_0123zzzu:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: pxor %xmm0, %xmm0
-; X64-SSSE3-NEXT: pinsrw $0, %esi, %xmm0
-; X64-SSSE3-NEXT: pinsrw $1, %edx, %xmm0
-; X64-SSSE3-NEXT: pinsrw $2, %ecx, %xmm0
-; X64-SSSE3-NEXT: pinsrw $3, %r8d, %xmm0
-; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
+; X86-LABEL: build_v8i8_0123zzzu:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
+; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
+; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X86-NEXT: pxor %mm0, %mm0
+; X86-NEXT: pxor %mm1, %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X86-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: paddd %mm2, %mm2
+; X86-NEXT: movq %mm2, (%eax)
+; X86-NEXT: retl
 ;
-; X64-AVX-LABEL: build_v8i8_0123zzzu:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $0, %esi, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v8i8_0123zzzu:
+; X64: # %bb.0:
+; X64-NEXT: movd %r8d, %mm0
+; X64-NEXT: movd %ecx, %mm1
+; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: movd %edx, %mm0
+; X64-NEXT: movd %esi, %mm2
+; X64-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
+; X64-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X64-NEXT: pxor %mm0, %mm0
+; X64-NEXT: pxor %mm1, %mm1
+; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
+; X64-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X64-NEXT: paddd %mm2, %mm2
+; X64-NEXT: movq %mm2, (%rdi)
+; X64-NEXT: retq
   %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
   %2 = insertelement <8 x i8> %1, i8 %a1, i32 1
   %3 = insertelement <8 x i8> %2, i8 %a2, i32 2
@@ -1314,119 +563,60 @@
 define void @build_v8i8_00000000(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
 ; X86-MMX-LABEL: build_v8i8_00000000:
 ; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movzbl 12(%ebp), %ecx
-; X86-MMX-NEXT: movl %ecx, %edx
-; X86-MMX-NEXT: shll $8, %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movl %edx, %ecx
-; X86-MMX-NEXT: shll $16, %ecx
-; X86-MMX-NEXT: orl %edx, %ecx
-; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movl %ecx, (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
+; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-MMX-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X86-MMX-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
 ; X86-MMX-NEXT: paddd %mm0, %mm0
 ; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
 ; X86-MMX-NEXT: retl
 ;
 ; X86-SSE2-LABEL: build_v8i8_00000000:
 ; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-8, %esp
-; X86-SSE2-NEXT: subl $8, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X86-SSE2-NEXT: packuswb %xmm0, %xmm0
-; X86-SSE2-NEXT: movq %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-SSE2-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
 ; X86-SSE2-NEXT: paddd %mm0, %mm0
 ; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
 ; X86-SSE2-NEXT: retl
 ;
 ; X86-SSSE3-LABEL: build_v8i8_00000000:
 ; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-8, %esp
-; X86-SSSE3-NEXT: subl $8, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: pxor %xmm0, %xmm0
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: pshufb %xmm0, %xmm1
-; X86-SSSE3-NEXT: movq %xmm1, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
+; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSSE3-NEXT: pxor %mm0, %mm0
+; X86-SSSE3-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-SSSE3-NEXT: pshufb %mm0, %mm1
+; X86-SSSE3-NEXT: paddd %mm1, %mm1
+; X86-SSSE3-NEXT: movq %mm1, (%eax)
 ; X86-SSSE3-NEXT: retl
 ;
 ; X64-SSE2-LABEL: build_v8i8_00000000:
 ; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movd %esi, %xmm0
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; X64-SSE2-NEXT: packuswb %xmm0, %xmm0
-; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
+; X64-SSE2-NEXT: movd %esi, %mm0
+; X64-SSE2-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
 ; X64-SSE2-NEXT: paddd %mm0, %mm0
 ; X64-SSE2-NEXT: movq %mm0, (%rdi)
 ; X64-SSE2-NEXT: retq
 ;
 ; X64-SSSE3-LABEL: build_v8i8_00000000:
 ; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movd %esi, %xmm0
-; X64-SSSE3-NEXT: pxor %xmm1, %xmm1
-; X64-SSSE3-NEXT: pshufb %xmm1, %xmm0
-; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
+; X64-SSSE3-NEXT: movd %esi, %mm0
+; X64-SSSE3-NEXT: pxor %mm1, %mm1
+; X64-SSSE3-NEXT: pshufb %mm1, %mm0
 ; X64-SSSE3-NEXT: paddd %mm0, %mm0
 ; X64-SSSE3-NEXT: movq %mm0, (%rdi)
 ; X64-SSSE3-NEXT: retq
 ;
-; X64-AVX1-LABEL: build_v8i8_00000000:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovd %esi, %xmm0
-; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v8i8_00000000:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovd %esi, %xmm0
-; X64-AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
-; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v8i8_00000000:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovd %esi, %xmm0
-; X64-AVX512-NEXT: vpbroadcastb %xmm0, %xmm0
-; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X64-AVX-LABEL: build_v8i8_00000000:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movd %esi, %mm0
+; X64-AVX-NEXT: pxor %mm1, %mm1
+; X64-AVX-NEXT: pshufb %mm1, %mm0
+; X64-AVX-NEXT: paddd %mm0, %mm0
+; X64-AVX-NEXT: movq %mm0, (%rdi)
+; X64-AVX-NEXT: retq
   %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
   %2 = insertelement <8 x i8> %1, i8 %a0, i32 1
   %3 = insertelement <8 x i8> %2, i8 %a0, i32 2
@@ -1446,57 +636,24 @@
 ;
 
 define void @build_v2f32_01(x86_mmx *%p0, float %a0, float %a1) nounwind {
-; X86-MMX-LABEL: build_v2f32_01:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: flds 12(%ebp)
-; X86-MMX-NEXT: flds 16(%ebp)
-; X86-MMX-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: fstps (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE-LABEL: build_v2f32_01:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-16, %esp
-; X86-SSE-NEXT: subl $32, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movaps %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
-; X86-SSE-NEXT: paddd %mm0, %mm0
-; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
-; X86-SSE-NEXT: retl
-;
-; X64-SSE-LABEL: build_v2f32_01:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
+; X86-LABEL: build_v2f32_01:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: paddd %mm1, %mm1
+; X86-NEXT: movq %mm1, (%eax)
+; X86-NEXT: retl
 ;
-; X64-AVX-LABEL: build_v2f32_01:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v2f32_01:
+; X64: # %bb.0:
+; X64-NEXT: movdq2q %xmm1, %mm0
+; X64-NEXT: movdq2q %xmm0, %mm1
+; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT: paddd %mm1, %mm1
+; X64-NEXT: movq %mm1, (%rdi)
+; X64-NEXT: retq
   %1 = insertelement <2 x float> undef, float %a0, i32 0
   %2 = insertelement <2 x float> %1, float %a1, i32 1
   %3 = bitcast <2 x float> %2 to x86_mmx
@@ -1506,78 +663,24 @@
 }
 
 define void @build_v2f32_0z(x86_mmx *%p0, float %a0, float %a1) nounwind {
-; X86-MMX-LABEL: build_v2f32_0z:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: flds 12(%ebp)
-; X86-MMX-NEXT: fstps (%esp)
-; X86-MMX-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
+; X86-LABEL: build_v2f32_0z:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pxor %mm0, %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: paddd %mm1, %mm1
+; X86-NEXT: movq %mm1, (%eax)
+; X86-NEXT: retl
 ;
-; X86-SSE-LABEL: build_v2f32_0z:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-16, %esp
-; X86-SSE-NEXT: subl $32, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movaps %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
-; X86-SSE-NEXT: paddd %mm0, %mm0
-; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
-; X86-SSE-NEXT: retl
-;
-; X64-SSE-LABEL: build_v2f32_0z:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: xorps %xmm1, %xmm1
-; X64-SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX1-LABEL: build_v2f32_0z:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64-AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v2f32_0z:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v2f32_0z:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64-AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X64-LABEL: build_v2f32_0z:
+; X64: # %bb.0:
+; X64-NEXT: movdq2q %xmm0, %mm0
+; X64-NEXT: pxor %mm1, %mm1
+; X64-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0]
+; X64-NEXT: paddd %mm0, %mm0
+; X64-NEXT: movq %mm0, (%rdi)
+; X64-NEXT: retq
   %1 = insertelement <2 x float> undef, float %a0, i32 0
   %2 = insertelement <2 x float> %1, float 0.0, i32 1
   %3 = bitcast <2 x float> %2 to x86_mmx
@@ -1587,100 +690,22 @@
 }
 
 define void @build_v2f32_u1(x86_mmx *%p0, float %a0, float %a1) nounwind {
-; X86-MMX-LABEL: build_v2f32_u1:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: flds 16(%ebp)
-; X86-MMX-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE2-LABEL: build_v2f32_u1:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-16, %esp
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
-; X86-SSE2-NEXT: movaps %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
-; X86-SSE2-NEXT: paddd %mm0, %mm0
-; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSSE3-LABEL: build_v2f32_u1:
-; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-16, %esp
-; X86-SSSE3-NEXT: subl $32, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; X86-SSSE3-NEXT: movaps %xmm0, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
-; X86-SSSE3-NEXT: retl
-;
-; X64-SSE2-LABEL: build_v2f32_u1:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,2,3]
-; X64-SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v2f32_u1:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X64-SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
+; X86-LABEL: build_v2f32_u1:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
+; X86-NEXT: paddd %mm0, %mm0
+; X86-NEXT: movq %mm0, (%eax)
+; X86-NEXT: retl
 ;
-; X64-AVX1-LABEL: build_v2f32_u1:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X64-AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v2f32_u1:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vbroadcastss %xmm1, %xmm0
-; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v2f32_u1:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vbroadcastss %xmm1, %xmm0
-; X64-AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X64-LABEL: build_v2f32_u1:
+; X64: # %bb.0:
+; X64-NEXT: movdq2q %xmm1, %mm0
+; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
+; X64-NEXT: paddd %mm0, %mm0
+; X64-NEXT: movq %mm0, (%rdi)
+; X64-NEXT: retq
   %1 = insertelement <2 x float> undef, float undef, i32 0
   %2 = insertelement <2 x float> %1, float %a1, i32 1
   %3 = bitcast <2 x float> %2 to x86_mmx
@@ -1690,61 +715,24 @@
 }
 
 define void @build_v2f32_z1(x86_mmx *%p0, float %a0, float %a1) nounwind {
-; X86-MMX-LABEL: build_v2f32_z1:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: flds 16(%ebp)
-; X86-MMX-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movl $0, (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE-LABEL: build_v2f32_z1:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-16, %esp
-; X86-SSE-NEXT: subl $32, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: xorps %xmm1, %xmm1
-; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
-; X86-SSE-NEXT: movaps %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
-; X86-SSE-NEXT: paddd %mm0, %mm0
-; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
-; X86-SSE-NEXT: retl
-;
-; X64-SSE-LABEL: build_v2f32_z1:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: xorps %xmm0, %xmm0
-; X64-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; X64-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
+; X86-LABEL: build_v2f32_z1:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: pxor %mm1, %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: paddd %mm1, %mm1
+; X86-NEXT: movq %mm1, (%eax)
+; X86-NEXT: retl
 ;
-; X64-AVX-LABEL: build_v2f32_z1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
-; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v2f32_z1:
+; X64: # %bb.0:
+; X64-NEXT: movdq2q %xmm1, %mm0
+; X64-NEXT: pxor %mm1, %mm1
+; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT: paddd %mm1, %mm1
+; X64-NEXT: movq %mm1, (%rdi)
+; X64-NEXT: retq
   %1 = insertelement <2 x float> undef, float 0.0, i32 0
   %2 = insertelement <2 x float> %1, float %a1, i32 1
   %3 = bitcast <2 x float> %2 to x86_mmx
@@ -1754,101 +742,22 @@
 }
 
 define void @build_v2f32_00(x86_mmx *%p0, float %a0, float %a1) nounwind {
-; X86-MMX-LABEL: build_v2f32_00:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: flds 12(%ebp)
-; X86-MMX-NEXT: fsts {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: fstps (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE2-LABEL: build_v2f32_00:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-16, %esp
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
-; X86-SSE2-NEXT: movaps %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
-; X86-SSE2-NEXT: paddd %mm0, %mm0
-; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSSE3-LABEL: build_v2f32_00:
-; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-16, %esp
-; X86-SSSE3-NEXT: subl $32, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; X86-SSSE3-NEXT: movaps %xmm0, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
-; X86-SSSE3-NEXT: retl
-;
-; X64-SSE2-LABEL: build_v2f32_00:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v2f32_00:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; X64-SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
+; X86-LABEL: build_v2f32_00:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
+; X86-NEXT: paddd %mm0, %mm0
+; X86-NEXT: movq %mm0, (%eax)
+; X86-NEXT: retl
 ;
-; X64-AVX1-LABEL: build_v2f32_00:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; X64-AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v2f32_00:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vbroadcastss %xmm0, %xmm0
-; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v2f32_00:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0
-; X64-AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X64-LABEL: build_v2f32_00:
+; X64: # %bb.0:
+; X64-NEXT: movdq2q %xmm0, %mm0
+; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
+; X64-NEXT: paddd %mm0, %mm0
+; X64-NEXT: movq %mm0, (%rdi)
+; X64-NEXT: retq
   %1 = insertelement <2 x float> undef, float %a0, i32 0
   %2 = insertelement <2 x float> %1, float %a0, i32 1
   %3 = bitcast <2 x float> %2 to x86_mmx
Index: test/CodeGen/X86/pr29222.ll
===================================================================
--- test/CodeGen/X86/pr29222.ll
+++ test/CodeGen/X86/pr29222.ll
@@ -10,11 +10,9 @@
 ; X86-SSE-NEXT: pushl %ebp
 ; X86-SSE-NEXT: movl %esp, %ebp
 ; X86-SSE-NEXT: andl $-8, %esp
-; X86-SSE-NEXT: subl $16, %esp
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X86-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; X86-SSE-NEXT: subl $8, %esp
+; X86-SSE-NEXT: movd 8(%ebp), %mm0
+; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X86-SSE-NEXT: packsswb %mm0, %mm0
 ; X86-SSE-NEXT: movq %mm0, (%esp)
 ; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
@@ -29,10 +27,9 @@
 ; X86-AVX-NEXT: pushl %ebp
 ; X86-AVX-NEXT: movl %esp, %ebp
 ; X86-AVX-NEXT: andl $-8, %esp
-; X86-AVX-NEXT: subl $16, %esp
-; X86-AVX-NEXT: vbroadcastss 8(%ebp), %xmm0
-; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: movd 8(%ebp), %mm0
+; X86-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X86-AVX-NEXT: packsswb %mm0, %mm0
 ; X86-AVX-NEXT: movq %mm0, (%esp)
 ; X86-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
@@ -44,10 +41,8 @@
 ;
 ; X64-SSE-LABEL: PR29222:
 ; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movd %edi, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
+; X64-SSE-NEXT: movd %edi, %mm0
+; X64-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X64-SSE-NEXT: packsswb %mm0, %mm0
 ; X64-SSE-NEXT: movq2dq %mm0, %xmm0
 ; X64-SSE-NEXT: packsswb %xmm0, %xmm0
@@ -56,10 +51,8 @@
 ;
 ; X64-AVX-LABEL: PR29222:
 ; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovd %edi, %xmm0
-; X64-AVX-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
+; X64-AVX-NEXT: movd %edi, %mm0
+; X64-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X64-AVX-NEXT: packsswb %mm0, %mm0
 ; X64-AVX-NEXT: movq2dq %mm0, %xmm0
 ; X64-AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
Index: test/CodeGen/X86/vec_insert-mmx.ll
===================================================================
--- test/CodeGen/X86/vec_insert-mmx.ll
+++ test/CodeGen/X86/vec_insert-mmx.ll
@@ -2,16 +2,13 @@
 ; RUN: llc < %s -mtriple=i686-darwin -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X32
 ; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s --check-prefix=X64
 
-; This is not an MMX operation; promoted to xmm.
+; This is not an MMX operation on x86_64; promoted to xmm.
 define x86_mmx @t0(i32 %A) nounwind {
 ; X32-LABEL: t0:
 ; X32: ## %bb.0:
-; X32-NEXT: subl $12, %esp
-; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; X32-NEXT: movq %xmm0, (%esp)
-; X32-NEXT: movq (%esp), %mm0
-; X32-NEXT: addl $12, %esp
+; X32-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X32-NEXT: pxor %mm0, %mm0
+; X32-NEXT: punpckldq %mm1, %mm0 ## mm0 = mm0[0],mm1[0]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: t0:
Index: test/CodeGen/X86/vector-shuffle-mmx.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-mmx.ll
+++ test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -33,27 +33,38 @@
 ; X32: ## %bb.0: ## %entry
 ; X32-NEXT: pushl %edi
 ; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: subl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 16
 ; X32-NEXT: .cfi_offset %edi, -8
 ; X32-NEXT: pxor %mm0, %mm0
-; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: movsd %xmm0, (%esp)
-; X32-NEXT: movq (%esp), %mm1
+; X32-NEXT: movl $28784, %eax ## imm = 0x7070
+; X32-NEXT: movd %eax, %mm1
+; X32-NEXT: movl $24672, %eax ## imm = 0x6060
+; X32-NEXT: movd %eax, %mm2
+; X32-NEXT: punpcklwd %mm1, %mm2 ## mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X32-NEXT: movl $16448, %eax ## imm = 0x4040
+; X32-NEXT: movd %eax, %mm1
+; X32-NEXT: pxor %mm3, %mm3
+; X32-NEXT: punpcklwd %mm1, %mm3 ## mm3 = mm3[0],mm1[0],mm3[1],mm1[1]
+; X32-NEXT: punpckldq %mm2, %mm3 ## mm3 = mm3[0],mm2[0]
 ; X32-NEXT: xorl %edi, %edi
-; X32-NEXT: maskmovq %mm1, %mm0
-; X32-NEXT: addl $8, %esp
+; X32-NEXT: maskmovq %mm3, %mm0
 ; X32-NEXT: popl %edi
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test1:
 ; X64: ## %bb.0: ## %entry
 ; X64-NEXT: pxor %mm0, %mm0
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm1
+; X64-NEXT: movl $28784, %eax ## imm = 0x7070
+; X64-NEXT: movd %eax, %mm1
+; X64-NEXT: movl $24672, %eax ## imm = 0x6060
+; X64-NEXT: movd %eax, %mm2
+; X64-NEXT: punpcklwd %mm1, %mm2 ## mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X64-NEXT: movl $16448, %eax ## imm = 0x4040
+; X64-NEXT: movd %eax, %mm1
+; X64-NEXT: pxor %mm3, %mm3
+; X64-NEXT: punpcklwd %mm1, %mm3 ## mm3 = mm3[0],mm1[0],mm3[1],mm1[1]
+; X64-NEXT: punpckldq %mm2, %mm3 ## mm3 = mm3[0],mm2[0]
 ; X64-NEXT: xorl %edi, %edi
-; X64-NEXT: maskmovq %mm1, %mm0
+; X64-NEXT: maskmovq %mm3, %mm0
 ; X64-NEXT: retq
 entry:
   %tmp528 = bitcast <8 x i8> zeroinitializer to <2 x i32>