diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19426,6 +19426,41 @@
       Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType()));
       return UpdateBuildVector(Ops);
     }
+
+    // If we're inserting into the end of a vector as part of a sequence, see
+    // if we can create a BUILD_VECTOR by following the sequence back up the
+    // chain.
+    if (Elt == (NumElts - 1)) {
+      SmallVector<SDValue> ReverseInsertions;
+      ReverseInsertions.push_back(InVal);
+
+      EVT MaxEltVT = InVal.getValueType();
+      SDValue CurVec = InVec;
+      for (unsigned I = 1; I != NumElts; ++I) {
+        if (CurVec.getOpcode() != ISD::INSERT_VECTOR_ELT || !CurVec.hasOneUse())
+          break;
+
+        auto *CurIdx = dyn_cast<ConstantSDNode>(CurVec.getOperand(2));
+        if (!CurIdx || CurIdx->getAPIntValue() != ((NumElts - 1) - I))
+          break;
+        SDValue CurVal = CurVec.getOperand(1);
+        ReverseInsertions.push_back(CurVal);
+        if (VT.isInteger()) {
+          EVT CurValVT = CurVal.getValueType();
+          MaxEltVT = MaxEltVT.bitsGE(CurValVT) ? MaxEltVT : CurValVT;
+        }
+        CurVec = CurVec.getOperand(0);
+      }
+
+      if (ReverseInsertions.size() == NumElts) {
+        for (unsigned I = 0; I != NumElts; ++I) {
+          SDValue Val = ReverseInsertions[(NumElts - 1) - I];
+          Val = VT.isInteger() ? DAG.getAnyExtOrTrunc(Val, DL, MaxEltVT) : Val;
+          Ops.push_back(Val);
+        }
+        return DAG.getBuildVector(VT, DL, Ops);
+      }
+    }
   }
 
   return SDValue();
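As an illustrative aside (not part of the patch itself): the pattern the new combine targets is an insertelement chain that overwrites every lane and finishes on the last lane, as in this function from the sse-insertelt.ll test updated below. The chain is walked back from the final insert, one value is collected per lane, and the whole sequence is rebuilt as a single BUILD_VECTOR; each insert in the chain must have a single use and a constant index counting down from NumElts - 1, and for integer vectors the collected values are any-extended or truncated to the widest element type first.

  define <2 x i64> @insert_i64_two_elts(<2 x i64> %x, i64 %s) {
    %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
    %i1 = insertelement <2 x i64> %i0, i64 %s, i32 1
    ret <2 x i64> %i1
  }

As the updated SSE checks show, this example now selects a movq plus a pshufd instead of two pinsrq instructions on X86.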
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -258,9 +258,7 @@
 define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
 ; CHECK-LABEL: ins2d1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v1.d[0], v0.d[0]
-; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
   %tmp3 = extractelement <2 x i64> %tmp1, i32 0
   %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
@@ -282,7 +280,7 @@
 define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
 ; CHECK-LABEL: ins2f1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: dup v0.2d, v0.d[1]
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
   %tmp3 = extractelement <2 x double> %tmp1, i32 1
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
@@ -228,7 +228,8 @@
 define <1 x double> @insertelement_v1f64(<1 x double> %op1) #0 {
 ; VBITS_GE_256-LABEL: insertelement_v1f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: fmov d0, #5.00000000
+; VBITS_GE_256-NEXT: mov x8, #4617315517961601024
+; VBITS_GE_256-NEXT: fmov d0, x8
 ; VBITS_GE_256-NEXT: ret
   %r = insertelement <1 x double> %op1, double 5.0, i64 0
   ret <1 x double> %r
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
--- a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
@@ -13,7 +13,7 @@
 ; CHECK: Legally typed node: [[VTWOA]]: v2f64 = BUILD_VECTOR
 ; CHECK: Legalizing node: [[VTWOB:t.*]]: v2f64 = BUILD_VECTOR
 ; CHECK: Legally typed node: [[VTWOB]]: v2f64 = BUILD_VECTOR
-; CHECK: Legalizing node: t34: v2f64 = fmaxnum nnan reassoc [[VTWOB]], [[VTWOA]]
+; CHECK: Legalizing node: t30: v2f64 = fmaxnum nnan reassoc [[VTWOB]], [[VTWOA]]
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
diff --git a/llvm/test/CodeGen/ARM/neon-copy.ll b/llvm/test/CodeGen/ARM/neon-copy.ll
--- a/llvm/test/CodeGen/ARM/neon-copy.ll
+++ b/llvm/test/CodeGen/ARM/neon-copy.ll
@@ -257,10 +257,7 @@
 define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
 ; CHECK-LABEL: ins2d1:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov.32 d2[0], r0
-; CHECK-NEXT: vmov.32 d2[1], r1
-; CHECK-NEXT: vorr d0, d2, d2
+; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: bx lr
   %tmp3 = extractelement <2 x i64> %tmp1, i32 0
   %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
@@ -7,13 +7,9 @@
 define <1 x i1> @insertelt_v1i1(<1 x i1> %x, i1 %elt) nounwind {
 ; CHECK-LABEL: insertelt_v1i1:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, mu
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
-; CHECK-NEXT: vand.vi v8, v8, 1
+; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: vmsne.vi v0, v8, 0
 ; CHECK-NEXT: ret
   %y = insertelement <1 x i1> %x, i1 %elt, i64 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -41,9 +41,8 @@
 ; RV64ZVE32F-NEXT: andi a1, a1, 1
 ; RV64ZVE32F-NEXT: beqz a1, .LBB0_2
 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV64ZVE32F-NEXT: lb a0, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, tu, mu
-; RV64ZVE32F-NEXT: vmv.s.x v8, a0
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vlse8.v v8, (a0), zero
 ; RV64ZVE32F-NEXT: .LBB0_2: # %else
 ; RV64ZVE32F-NEXT: ret
   %v = call <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*> %ptrs, i32 1, <1 x i1> %m, <1 x i8> %passthru)
@@ -1012,9 +1011,8 @@
 ; RV64ZVE32F-NEXT: andi a1, a1, 1
 ; RV64ZVE32F-NEXT: beqz a1, .LBB13_2
 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV64ZVE32F-NEXT: lh a0, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, tu, mu
-; RV64ZVE32F-NEXT: vmv.s.x v8, a0
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
+; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero
 ; RV64ZVE32F-NEXT: .LBB13_2: # %else
 ; RV64ZVE32F-NEXT: ret
   %v = call <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*> %ptrs, i32 2, <1 x i1> %m, <1 x i16> %passthru)
@@ -2325,9 +2323,8 @@
 ; RV64ZVE32F-NEXT: andi a1, a1, 1
 ; RV64ZVE32F-NEXT: beqz a1, .LBB27_2
 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV64ZVE32F-NEXT: lw a0, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, tu, mu
-; RV64ZVE32F-NEXT: vmv.s.x v8, a0
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero
 ; RV64ZVE32F-NEXT: .LBB27_2: # %else
 ; RV64ZVE32F-NEXT: ret
   %v = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptrs, i32 4, <1 x i1> %m, <1 x i32> %passthru)
@@ -7574,9 +7571,8 @@
 ; RV64ZVE32F-NEXT: andi a1, a1, 1
 ; RV64ZVE32F-NEXT: beqz a1, .LBB58_2
 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV64ZVE32F-NEXT: flh ft0, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, tu, mu
-; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
+; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero
 ; RV64ZVE32F-NEXT: .LBB58_2: # %else
 ; RV64ZVE32F-NEXT: ret
   %v = call <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*> %ptrs, i32 2, <1 x i1> %m, <1 x half> %passthru)
@@ -8594,9 +8590,8 @@
 ; RV64ZVE32F-NEXT: andi a1, a1, 1
 ; RV64ZVE32F-NEXT: beqz a1, .LBB68_2
 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV64ZVE32F-NEXT: flw ft0, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, tu, mu
-; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero
 ; RV64ZVE32F-NEXT: .LBB68_2: # %else
 ; RV64ZVE32F-NEXT: ret
   %v = call <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*> %ptrs, i32 4, <1 x i1> %m, <1 x float> %passthru)
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -593,8 +593,9 @@
 define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
 ; CHECK-LABEL: insert_v2i64:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: vmovq %rdi, %xmm1
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; CHECK-NEXT: retq
   %val = load i64, i64* %ptr
   %r1 = insertelement <2 x i64> %x, i64 %val, i32 1
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -2284,38 +2284,40 @@
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
 ; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0
-; KNL_64-NEXT: vmovq %rdi, %xmm2
-; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2
-; KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; KNL_64-NEXT: vmovq %rdi, %xmm1
+; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
+; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; KNL_64-NEXT: vmovq %xmm0, %rax
-; KNL_64-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1
-; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
-; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm0
+; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
+; KNL_64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL_64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; KNL_64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test26:
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
-; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
-; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
+; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; KNL_32-NEXT: vmovd %xmm0, %eax
-; KNL_32-NEXT: vpinsrd $0, (%eax), %xmm1, %xmm1
-; KNL_32-NEXT: vpinsrd $1, 4(%eax), %xmm1, %xmm1
-; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
-; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm0
-; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
+; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
+; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL_32-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0
+; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm0
+; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm0, %xmm0
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test26:
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
-; SKX-NEXT: vpbroadcastq %rdi, %xmm2
+; SKX-NEXT: vpbroadcastq %rdi, %xmm1
 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
-; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; SKX-NEXT: vmovq %xmm0, %rax
-; SKX-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1
-; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm0
+; SKX-NEXT: vpextrq $1, %xmm0, %rcx
+; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; SKX-NEXT: retq
 ;
 ; SKX_32-LABEL: test26:
@@ -2323,11 +2325,11 @@
 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0
 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
 ; SKX_32-NEXT: vmovd %xmm0, %eax
-; SKX_32-NEXT: vpinsrd $0, (%eax), %xmm1, %xmm1
-; SKX_32-NEXT: vpinsrd $1, 4(%eax), %xmm1, %xmm1
-; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
-; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm0
-; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
+; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
+; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX_32-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0
+; SKX_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm0
+; SKX_32-NEXT: vpinsrd $3, 4(%ecx), %xmm0, %xmm0
 ; SKX_32-NEXT: retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
diff --git a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
--- a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
@@ -379,26 +379,15 @@
 }
 
 define <2 x i64> @insert_i64_two_elts(<2 x i64> %x, i64* %s.addr) {
-; SSE2-LABEL: insert_i64_two_elts:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq (%rdi), %rax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: insert_i64_two_elts:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movq (%rdi), %rax
-; SSE41-NEXT: pinsrq $0, %rax, %xmm0
-; SSE41-NEXT: pinsrq $1, %rax, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: insert_i64_two_elts:
+; SSE: # %bb.0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: insert_i64_two_elts:
 ; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: vpinsrq $0, %rax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm0
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX-NEXT: retq
   %s = load i64, i64* %s.addr
   %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll
--- a/llvm/test/CodeGen/X86/sse-insertelt.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -352,24 +352,11 @@
 }
 
 define <2 x i64> @insert_i64_two_elts(<2 x i64> %x, i64 %s) {
-; SSE2-LABEL: insert_i64_two_elts:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq %rdi, %xmm0
-; SSE2-NEXT: movq %rdi, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: insert_i64_two_elts:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pinsrq $0, %rdi, %xmm0
-; SSE41-NEXT: pinsrq $1, %rdi, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: insert_i64_two_elts:
-; AVX: # %bb.0:
-; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
-; AVX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE-LABEL: insert_i64_two_elts:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %rdi, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
   %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
   %i1 = insertelement <2 x i64> %i0, i64 %s, i32 1
   ret <2 x i64> %i1
diff --git a/llvm/test/CodeGen/X86/vec_insert-7.ll b/llvm/test/CodeGen/X86/vec_insert-7.ll
--- a/llvm/test/CodeGen/X86/vec_insert-7.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-7.ll
@@ -8,7 +8,8 @@
 define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
 ; X86-LABEL: mmx_movzl:
 ; X86: ## %bb.0:
-; X86-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm0
+; X86-NEXT: movl $32, %eax
+; X86-NEXT: movd %eax, %mm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: mmx_movzl: