Index: llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp =================================================================== --- llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -199,6 +199,44 @@ DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask); break; + case X86::PSLLDQri: + case X86::VPSLLDQri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSLLDQMask(MVT::v16i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::VPSLLDQYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSLLDQMask(MVT::v32i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::PSRLDQri: + case X86::VPSRLDQri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSRLDQMask(MVT::v16i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::VPSRLDQYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSRLDQMask(MVT::v32i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::PALIGNR128rr: case X86::VPALIGNR128rr: Src1Name = getRegName(MI->getOperand(2).getReg()); Index: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h =================================================================== --- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h +++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -40,6 +40,10 @@ void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); +void DecodePSLLDQMask(MVT VT, unsigned 
Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); Index: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp =================================================================== --- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -79,6 +79,35 @@ } } +void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned NumElts = VectorSizeInBits / 8; + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) { + int M = SM_SentinelZero; + if (i >= Imm) M = i - Imm + l; + ShuffleMask.push_back(M); + } +} + +void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned NumElts = VectorSizeInBits / 8; + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) { + unsigned Base = i + Imm; + int M = Base + l; + if (Base >= NumLaneElts) M = SM_SentinelZero; + ShuffleMask.push_back(M); + } +} + void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll +++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -455,21 +455,21 @@ ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> 
@test_x86_sse2_psll_dq(<2 x i64> %a0) { - ; CHECK: vpslldq - %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { - ; CHECK: vpslldq - %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} + + +define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { + ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone + + +define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { + ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] + %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone @@ -551,21 +551,21 @@ ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { - ; CHECK: vpsrldq - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { - ; CHECK: vpsrldq - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} + + +define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { + ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> 
%res +} +declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone + + +define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { + ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero + %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone Index: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -158,21 +158,21 @@ ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) { - ; CHECK: vpslldq - %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) { - ; CHECK: vpslldq - %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} + + +define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) { + ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] + %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone + + +define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) { + ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24] + %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] + ret <4 x 
i64> %res +} declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone @@ -254,21 +254,21 @@ ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) { - ; CHECK: vpsrldq - %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) { - ; CHECK: vpsrldq - %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} + + +define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) { + ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] + %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone + + +define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) { + ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero + %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone Index: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -408,21 +408,21 @@ ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { - ; CHECK: pslldq - %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x 
i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { - ; CHECK: pslldq - %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} + + +define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { + ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone + + +define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { + ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] + %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone @@ -504,21 +504,21 @@ ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { - ; CHECK: psrldq - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { - ; CHECK: psrldq - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} + + +define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { + ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone + + +define <2 x i64> 
@test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { + ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero + %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1397,77 +1397,77 @@ define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { ; SSE-LABEL: shuffle_v8i16_z8zzzzzz: -; SSE: # BB#0: -; SSE-NEXT: movzwl %di, %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pslldq $2, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: shuffle_v8i16_z8zzzzzz: -; AVX: # BB#0: -; AVX-NEXT: movzwl %di, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpslldq $2, %xmm0, %xmm0 -; AVX-NEXT: retq - %a = insertelement <8 x i16> undef, i16 %i, i32 0 - %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_z8zzzzzz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zzzzz8zz: -; SSE: # BB#0: -; SSE-NEXT: movzwl %di, %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pslldq $10, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: shuffle_v8i16_zzzzz8zz: -; AVX: # BB#0: -; AVX-NEXT: movzwl %di, %eax -; AVX-NEXT: vmovd 
%eax, %xmm0 -; AVX-NEXT: vpslldq $10, %xmm0, %xmm0 -; AVX-NEXT: retq - %a = insertelement <8 x i16> undef, i16 %i, i32 0 - %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zzzzz8zz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zuuzuuz8: -; SSE: # BB#0: -; SSE-NEXT: movzwl %di, %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pslldq $14, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: shuffle_v8i16_zuuzuuz8: -; AVX: # BB#0: -; AVX-NEXT: movzwl %di, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpslldq $14, %xmm0, %xmm0 -; AVX-NEXT: retq - %a = insertelement <8 x i16> undef, i16 %i, i32 0 - %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zuuzuuz8: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle } define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) { ; SSE-LABEL: 
shuffle_v8i16_zzBzzzzz: -; SSE: # BB#0: -; SSE-NEXT: movzwl %di, %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pslldq $4, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: shuffle_v8i16_zzBzzzzz: -; AVX: # BB#0: -; AVX-NEXT: movzwl %di, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpslldq $4, %xmm0, %xmm0 -; AVX-NEXT: retq - %a = insertelement <8 x i16> undef, i16 %i, i32 3 - %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zzBzzzzz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 3 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle }