Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -2420,6 +2420,41 @@ MI->getOperand(3).setImm(Size-Amt); return TargetInstrInfo::commuteInstruction(MI, NewMI); } + case X86::BLENDPDrri: + case X86::BLENDPSrri: + case X86::PBLENDWrri: + case X86::VBLENDPDrri: + case X86::VBLENDPSrri: + case X86::VBLENDPDYrri: + case X86::VBLENDPSYrri: + case X86::VPBLENDDrri: + case X86::VPBLENDWrri: + case X86::VPBLENDDYrri: + case X86::VPBLENDWYrri:{ + unsigned Mask; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::BLENDPDrri: Mask = 0x03; break; + case X86::BLENDPSrri: Mask = 0x0F; break; + case X86::PBLENDWrri: Mask = 0xFF; break; + case X86::VBLENDPDrri: Mask = 0x03; break; + case X86::VBLENDPSrri: Mask = 0x0F; break; + case X86::VBLENDPDYrri: Mask = 0x0F; break; + case X86::VBLENDPSYrri: Mask = 0xFF; break; + case X86::VPBLENDDrri: Mask = 0x0F; break; + case X86::VPBLENDWrri: Mask = 0xFF; break; + case X86::VPBLENDDYrri: Mask = 0xFF; break; + case X86::VPBLENDWYrri: Mask = 0xFF; break; + } + unsigned Imm = MI->getOperand(3).getImm(); + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm(Mask ^ Imm); + return TargetInstrInfo::commuteInstruction(MI, NewMI); + } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: @@ -2504,6 +2539,20 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { switch (MI->getOpcode()) { + case X86::BLENDPDrri: + case X86::BLENDPSrri: + case X86::PBLENDWrri: + case X86::VBLENDPDrri: + case X86::VBLENDPSrri: + case X86::VBLENDPDYrri: + case X86::VBLENDPSYrri: + case X86::VPBLENDDrri: + case X86::VPBLENDDYrri: + case X86::VPBLENDWrri: + case X86::VPBLENDWYrri: + SrcOpIdx1 = 1; + SrcOpIdx2 = 2; + return true; case X86::VFMADDPDr231r: case X86::VFMADDPSr231r: case X86::VFMADDSDr231r: Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -7537,31 +7537,33 @@ let Predicates = [HasAVX] in { let isCommutable = 0 in { - let ExeDomain = SSEPackedSingle in { - defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, loadv4f32, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, loadv8f32, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; - } - let ExeDomain = SSEPackedDouble in { - defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, loadv2f64, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256,VR256, loadv4f64, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; - } + defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_MPSADSCHED>, VEX_4V; + } + + let ExeDomain = SSEPackedSingle in { + defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, + VR128, loadv4f32, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", + int_x86_avx_blend_ps_256, VR256, loadv8f32, + f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, + VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, + VR128, loadv2f64, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", + int_x86_avx_blend_pd_256,VR256, loadv4f64, + f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, + VEX_4V, VEX_L; + } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, VR128, loadv2i64, i128mem, 0, DEFAULT_ITINS_BLENDSCHED>, VEX_4V; - defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - VR128, loadv2i64, i128mem, 0, - DEFAULT_ITINS_MPSADSCHED>, VEX_4V; - } + let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, VR128, loadv4f32, f128mem, 0, @@ -7589,6 +7591,10 @@ let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { + defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, + VR128, memopv2i64, i128mem, + 1, SSE_MPSADBW_ITINS>; + } let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, VR128, memopv4f32, f128mem, @@ -7600,10 +7606,6 @@ defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, VR128, memopv2i64, i128mem, 1, SSE_INTALU_ITINS_BLEND_P>; - defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv2i64, i128mem, - 1, SSE_MPSADBW_ITINS>; - } let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, VR128, memopv4f32, f128mem, 1, @@ -8827,12 +8829,10 @@ Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; } -let isCommutable = 0 in { defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, VR128, loadv2i64, i128mem>; defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, VR256, loadv4i64, i256mem>, VEX_L; -} def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$mask)), Index: test/CodeGen/X86/combine-or.ll =================================================================== --- test/CodeGen/X86/combine-or.ll +++ test/CodeGen/X86/combine-or.ll @@ -19,8 +19,7 @@ define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test2: ; CHECK: # BB#0: -; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> @@ -32,8 +31,7 @@ define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test3: ; CHECK: # BB#0: -; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32> @@ -45,8 +43,7 @@ define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test4: ; CHECK: # BB#0: -; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> @@ -106,8 +103,7 @@ define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test9: ; CHECK: # BB#0: -; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; CHECK-NEXT: retq %and1 = and <4 x i32> %a, %and2 = and <4 x i32> %b, @@ -119,8 +115,7 @@ define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test10: ; CHECK: # BB#0: -; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; CHECK-NEXT: retq %and1 = and <2 x i64> %a, %and2 = and <2 x i64> %b, @@ -132,8 +127,7 @@ define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test11: ; CHECK: # BB#0: -; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; CHECK-NEXT: retq %and1 = and <4 x i32> %a, %and2 = and <4 x i32> %b, @@ -230,12 +224,10 @@ ; CHECK-LABEL: test18: ; CHECK: # BB#0: ; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: xorps %xmm3, %xmm3 -; CHECK-NEXT: blendps {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,1,1] -; CHECK-NEXT: blendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; CHECK-NEXT: orps %xmm0, %xmm2 -; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> @@ -295,8 +287,7 @@ define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) { ; CHECK-LABEL: test_crash: ; CHECK: # BB#0: -; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32> Index: test/CodeGen/X86/commute-blend-avx2.ll =================================================================== --- test/CodeGen/X86/commute-blend-avx2.ll +++ test/CodeGen/X86/commute-blend-avx2.ll @@ -0,0 +1,89 @@ +; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=avx2 < %s | FileCheck %s + +define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 { + %1 = load <8 x i16>* %b + %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17) + ret <8 x i16> %2 + + ;LABEL: commute_fold_vpblendw_128 + ;CHECK: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] + ;CHECK-NEXT: retq +} +declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone + +define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 { + %1 = load <16 x i16>* %b + %2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %1, <16 x i16> %a, i8 17) + ret <16 x i16> %2 + + ;LABEL: commute_fold_vpblendw_256 + ;CHECK: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15] + ;CHECK-NEXT: retq +} +declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone + +define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 { + %1 = load <4 x i32>* %b + %2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1) + ret <4 x i32> %2 + + ;LABEL: commute_fold_vpblendd_128 + ;CHECK: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] + ;CHECK-NEXT: retq +} +declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone + +define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 { + %1 = load <8 x i32>* %b + %2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129) + ret <8 x i32> %2 + + ;LABEL: commute_fold_vpblendd_256 + ;CHECK: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7] + ;CHECK-NEXT: retq +} +declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone + +define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 { + %1 = load <4 x float>* %b + %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3) + ret <4 x float> %2 + + ;LABEL: commute_fold_vblendps_128 + ;CHECK: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] + ;CHECK-NEXT: retq +} +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone + +define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 { + %1 = load <8 x float>* %b + %2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %1, <8 x float> %a, i8 7) + ret <8 x float> %2 + + ;LABEL: commute_fold_vblendps_256 + ;CHECK: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7] + ;CHECK-NEXT: retq +} +declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone + +define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 { + %1 = load <2 x double>* %b + %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1) + ret <2 x double> %2 + + ;LABEL: commute_fold_vblendpd_128 + ;CHECK: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] + ;CHECK-NEXT: retq +} +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone + +define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 { + %1 = load <4 x double>* %b + %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7) + ret <4 x double> %2 + + ;LABEL: commute_fold_vblendpd_256 + ;CHECK: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3] + ;CHECK-NEXT: retq +} +declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone Index: test/CodeGen/X86/commute-blend-sse41.ll =================================================================== --- test/CodeGen/X86/commute-blend-sse41.ll +++ test/CodeGen/X86/commute-blend-sse41.ll @@ -0,0 +1,34 @@ +; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=corei7 < %s | FileCheck %s + +define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 { + %1 = load <8 x i16>* %b + %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17) + ret <8 x i16> %2 + + ;LABEL: commute_fold_pblendw + ;CHECK: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] + ;CHECK-NEXT: retq +} +declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone + +define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 { + %1 = load <4 x float>* %b + %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3) + ret <4 x float> %2 + + ;LABEL: commute_fold_blendps + ;CHECK: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] + ;CHECK-NEXT: retq +} +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone + +define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 { + %1 = load <2 x double>* %b + %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1) + ret <2 x double> %2 + + ;LABEL: commute_fold_vblendpd + ;CHECK: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] + ;CHECK-NEXT: retq +} +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone Index: test/CodeGen/X86/vector-blend.ll =================================================================== --- test/CodeGen/X86/vector-blend.ll +++ test/CodeGen/X86/vector-blend.ll @@ -202,8 +202,7 @@ ; ; SSE41-LABEL: vsel_8xi16: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: vsel_8xi16: @@ -518,10 +517,8 @@ ; ; SSE41-LABEL: constant_blendvps_avx: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3] -; SSE41-NEXT: blendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3] -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; SSE41-NEXT: retq ; ; AVX-LABEL: constant_blendvps_avx: @@ -637,10 +634,8 @@ ; ; SSE41-LABEL: blend_shufflevector_8xfloat: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE41-NEXT: blendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3] ; SSE41-NEXT: retq ; ; AVX-LABEL: blend_shufflevector_8xfloat: @@ -694,8 +689,7 @@ ; ; SSE41-LABEL: blend_shufflevector_4xi64: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: movaps %xmm3, %xmm1 ; SSE41-NEXT: retq ; Index: test/CodeGen/X86/vector-shuffle-128-v2.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v2.ll +++ test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -260,8 +260,7 @@ ; ; SSE41-LABEL: shuffle_v2f64_21: ; SSE41: # BB#0: -; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v2f64_21: @@ -508,8 +507,7 @@ ; ; SSE41-LABEL: shuffle_v2i64_21: ; SSE41: # BB#0: -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: shuffle_v2i64_21: @@ -545,8 +543,8 @@ ; ; SSE41-LABEL: shuffle_v2i64_21_copy: ; SSE41: # BB#0: -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: shuffle_v2i64_21_copy: Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ test/CodeGen/X86/vector-shuffle-combining.ll @@ -369,7 +369,8 @@ ; SSE41: # BB#0: ; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_bitwise_ops_test3b: @@ -411,8 +412,7 @@ ; SSE41-LABEL: combine_bitwise_ops_test4b: ; SSE41: # BB#0: ; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_bitwise_ops_test4b: @@ -452,8 +452,7 @@ ; SSE41-LABEL: combine_bitwise_ops_test5b: ; SSE41: # BB#0: ; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_bitwise_ops_test5b: @@ -1170,8 +1169,7 @@ ; ; SSE41-LABEL: combine_test2: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test2: @@ -1237,8 +1235,7 @@ ; ; SSE41-LABEL: combine_test5: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test5: @@ -1299,8 +1296,7 @@ ; ; SSE41-LABEL: combine_test7: ; SSE41: # BB#0: -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_test7: @@ -1371,8 +1367,7 @@ ; ; SSE41-LABEL: combine_test10: ; SSE41: # BB#0: -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_test10: @@ -1415,8 +1410,7 @@ ; ; SSE41-LABEL: combine_test12: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test12: @@ -1479,8 +1473,7 @@ ; ; SSE41-LABEL: combine_test15: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test15: @@ -1518,8 +1511,7 @@ ; ; SSE41-LABEL: combine_test17: ; SSE41: # BB#0: -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_test17: @@ -1587,8 +1579,7 @@ ; ; SSE41-LABEL: combine_test20: ; SSE41: # BB#0: -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_test20: @@ -1632,10 +1623,9 @@ ; ; SSE41-LABEL: combine_test1b: ; SSE41: # BB#0: -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -1673,11 +1663,9 @@ ; ; SSE41-LABEL: combine_test2b: ; SSE41: # BB#0: -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,1] -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test2b: @@ -1736,10 +1724,9 @@ ; ; SSE41-LABEL: combine_test4b: ; SSE41: # BB#0: -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[0,2] +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2] ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -2006,8 +1993,7 @@ ; ; SSE41-LABEL: combine_blend_01: ; SSE41: # BB#0: -; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_blend_01: @@ -2036,8 +2022,7 @@ ; ; SSE41-LABEL: combine_blend_02: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_blend_02: @@ -2070,8 +2055,7 @@ ; ; SSE41-LABEL: combine_blend_123: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_blend_123: @@ -2153,8 +2137,7 @@ ; ; SSE41-LABEL: combine_undef_input_test1: ; SSE41: # BB#0: -; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_undef_input_test1: @@ -2343,8 +2326,7 @@ ; ; SSE41-LABEL: combine_undef_input_test11: ; SSE41: # BB#0: -; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_undef_input_test11: