diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp --- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp +++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp @@ -156,6 +156,8 @@ - // `vunpcklpd/vmovlhps r, r` -> `vshufps r, r, 0x44` - // `vunpckhpd/vmovlhps r, r` -> `vshufps r, r, 0xee` - // iff `vshufps` is faster than `vunpck{l|h}pd`. Otherwise stick with + // `vunpcklpd/vmovlhps r, r` -> `vshufpd r, r, 0x00` + // `vunpckhpd r, r` -> `vshufpd r, r, 0xff` + // `vunpcklpd r, r, k{z}` -> `vshufpd r, r, k{z}, 0x00` + // `vunpckhpd r, r, k{z}` -> `vshufpd r, r, k{z}, 0xff` + // iff `vshufpd` is faster than `vunpck{l|h}pd`. Otherwise stick with // `vunpck{l|h}pd` as it uses less code size. // TODO: Look into using `{VP}UNPCK{L|H}QDQ{...}` instead of `{V}SHUF{...}PS` @@ -168,11 +170,12 @@ MI.addOperand(MachineOperand::CreateImm(MaskImm)); return true; }; + // NOTE: `shufpd` immediates; NewOpc must be a `{V}SHUFPD*` opcode. auto ProcessUNPCKLPDrr = [&](unsigned NewOpc) -> bool { - return ProcessUNPCKPD(NewOpc, 0x44); + return ProcessUNPCKPD(NewOpc, 0x00); }; auto ProcessUNPCKHPDrr = [&](unsigned NewOpc) -> bool { - return ProcessUNPCKPD(NewOpc, 0xee); + return ProcessUNPCKPD(NewOpc, 0xff); }; switch (Opc) { @@ -240,23 +243,47 @@ // VMOVLHPS is always 128 bits. 
case X86::VMOVLHPSZrr: case X86::VUNPCKLPDZ128rr: - return ProcessUNPCKLPDrr(X86::VSHUFPSZ128rri); + return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rri); case X86::VUNPCKLPDZ256rr: - return ProcessUNPCKLPDrr(X86::VSHUFPSZ256rri); + return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rri); case X86::VUNPCKLPDZrr: - return ProcessUNPCKLPDrr(X86::VSHUFPSZrri); + return ProcessUNPCKLPDrr(X86::VSHUFPDZrri); + case X86::VUNPCKLPDZ128rrk: + return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrik); + case X86::VUNPCKLPDZ256rrk: + return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrik); + case X86::VUNPCKLPDZrrk: + return ProcessUNPCKLPDrr(X86::VSHUFPDZrrik); + case X86::VUNPCKLPDZ128rrkz: + return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrikz); + case X86::VUNPCKLPDZ256rrkz: + return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrikz); + case X86::VUNPCKLPDZrrkz: + return ProcessUNPCKLPDrr(X86::VSHUFPDZrrikz); case X86::UNPCKHPDrr: - return ProcessUNPCKHPDrr(X86::SHUFPSrri); + return ProcessUNPCKHPDrr(X86::SHUFPDrri); case X86::VUNPCKHPDrr: - return ProcessUNPCKHPDrr(X86::VSHUFPSrri); + return ProcessUNPCKHPDrr(X86::VSHUFPDrri); case X86::VUNPCKHPDYrr: - return ProcessUNPCKHPDrr(X86::VSHUFPSYrri); + return ProcessUNPCKHPDrr(X86::VSHUFPDYrri); case X86::VUNPCKHPDZ128rr: - return ProcessUNPCKHPDrr(X86::VSHUFPSZ128rri); + return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rri); case X86::VUNPCKHPDZ256rr: - return ProcessUNPCKHPDrr(X86::VSHUFPSZ256rri); + return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rri); case X86::VUNPCKHPDZrr: - return ProcessUNPCKHPDrr(X86::VSHUFPSZrri); + return ProcessUNPCKHPDrr(X86::VSHUFPDZrri); + case X86::VUNPCKHPDZ128rrk: + return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrik); + case X86::VUNPCKHPDZ256rrk: + return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrik); + case X86::VUNPCKHPDZrrk: + return ProcessUNPCKHPDrr(X86::VSHUFPDZrrik); + case X86::VUNPCKHPDZ128rrkz: + return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrikz); + case X86::VUNPCKHPDZ256rrkz: + return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrikz); + case X86::VUNPCKHPDZrrkz: + 
return ProcessUNPCKHPDrr(X86::VSHUFPDZrrikz); default: return false; } diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll --- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll @@ -32,7 +32,7 @@ ; ; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr: ; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1],ymm0[4,5],ymm1[4,5] +; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-ICX-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKLPDYrr: @@ -61,7 +61,7 @@ ; ; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr: ; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3],ymm0[6,7],ymm1[6,7] +; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; CHECK-ICX-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKHPDYrr: @@ -90,7 +90,7 @@ ; ; CHECK-ICX-LABEL: transform_VUNPCKLPDrr: ; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-ICX-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKLPDrr: @@ -119,7 +119,7 @@ ; ; CHECK-ICX-LABEL: transform_VUNPCKHPDrr: ; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,3] +; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-ICX-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKHPDrr: @@ -164,52 +164,144 @@ ret <8 x double> %res } -; Check that masked vunpcklpd will not be transformed into vshufps. 
define <4 x double> @transform_VUNPCKLPDYrrkz(<4 x double> %a, <4 x double> %b, i4 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDYrrkz: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKLPDYrrkz: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %edi, %k1 +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: transform_VUNPCKLPDYrrkz: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKLPDYrrkz: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %edi, %k1 +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrrkz: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %edi, %k1 +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrrkz: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %edi, %k1 +; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i4 %mask_int to <4 x i1> %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> zeroinitializer ret <4 x double> %res } -; Check that masked vunpcklpd will not be transformed into vshufps. 
define <4 x double> @transform_VUNPCKHPDYrrkz(<4 x double> %a, <4 x double> %b, i4 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKHPDYrrkz: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKHPDYrrkz: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %edi, %k1 +; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: transform_VUNPCKHPDYrrkz: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKHPDYrrkz: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %edi, %k1 +; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrrkz: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %edi, %k1 +; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrrkz: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %edi, %k1 +; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i4 %mask_int to <4 x i1> %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> zeroinitializer ret <4 x double> %res } -; Check that masked vunpcklpd will not be transformed into vshufps. 
define <2 x double> @transform_VUNPCKLPDrrkz(<2 x double> %a, <2 x double> %b, i2 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDrrkz: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKLPDrrkz: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %edi, %k1 +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: transform_VUNPCKLPDrrkz: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKLPDrrkz: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %edi, %k1 +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKLPDrrkz: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %edi, %k1 +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrrkz: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %edi, %k1 +; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2> %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> zeroinitializer ret <2 x double> %res } -; Check that masked vunpcklpd will not be transformed into vshufps. 
define <2 x double> @transform_VUNPCKHPDrrkz(<2 x double> %a, <2 x double> %b, i2 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKHPDrrkz: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKHPDrrkz: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %edi, %k1 +; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: transform_VUNPCKHPDrrkz: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKHPDrrkz: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %edi, %k1 +; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKHPDrrkz: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %edi, %k1 +; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrrkz: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %edi, %k1 +; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3> %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> zeroinitializer @@ -242,56 +334,164 @@ ret <8 x double> %res } -; Check that masked vunpcklpd will not be transformed into vshufps. 
define <4 x double> @transform_VUNPCKLPDYrrk(<4 x double> %a, <4 x double> %b, <4 x double> %c, i4 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDYrrk: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; CHECK-NEXT: vmovapd %ymm2, %ymm0 -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKLPDYrrk: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %edi, %k1 +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-SKX-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: transform_VUNPCKLPDYrrk: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ICX-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKLPDYrrk: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %edi, %k1 +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-V4-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrrk: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %edi, %k1 +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-AVX512-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrrk: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %edi, %k1 +; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ZNVER4-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i4 %mask_int to <4 x i1> %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> %c ret <4 x double> %res } -; Check that masked vunpcklpd will not be transformed into vshufps. 
define <4 x double> @transform_VUNPCKHPDYrrk(<4 x double> %a, <4 x double> %b, <4 x double> %c, i4 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKHPDYrrk: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; CHECK-NEXT: vmovapd %ymm2, %ymm0 -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKHPDYrrk: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %edi, %k1 +; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-SKX-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: transform_VUNPCKHPDYrrk: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ICX-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKHPDYrrk: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %edi, %k1 +; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-V4-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrrk: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %edi, %k1 +; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-AVX512-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrrk: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %edi, %k1 +; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ZNVER4-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i4 %mask_int to <4 x i1> %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> %c ret <4 x double> %res } -; Check that masked vunpcklpd will not be transformed into vshufps. 
define <2 x double> @transform_VUNPCKLPDrrk(<2 x double> %a, <2 x double> %b, <2 x double> %c, i2 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDrrk: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] -; CHECK-NEXT: vmovapd %xmm2, %xmm0 -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKLPDrrk: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %edi, %k1 +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] +; CHECK-SKX-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: transform_VUNPCKLPDrrk: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] +; CHECK-ICX-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKLPDrrk: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %edi, %k1 +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] +; CHECK-V4-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKLPDrrk: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %edi, %k1 +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] +; CHECK-AVX512-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrrk: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %edi, %k1 +; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] +; CHECK-ZNVER4-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2> %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> %c ret <2 x double> %res } -; Check that masked vunpcklpd will not be transformed into vshufps. 
define <2 x double> @transform_VUNPCKHPDrrk(<2 x double> %a, <2 x double> %b, <2 x double> %c, i2 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKHPDrrk: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] -; CHECK-NEXT: vmovapd %xmm2, %xmm0 -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKHPDrrk: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %edi, %k1 +; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] +; CHECK-SKX-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: transform_VUNPCKHPDrrk: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] +; CHECK-ICX-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKHPDrrk: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %edi, %k1 +; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] +; CHECK-V4-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKHPDrrk: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %edi, %k1 +; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] +; CHECK-AVX512-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrrk: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %edi, %k1 +; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] +; CHECK-ZNVER4-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3> %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> %c diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll --- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll @@ -11,7 +11,7 @@ ; ; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr: ; 
CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1],ymm0[4,5],ymm1[4,5] +; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-ICX-NEXT: retq %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13> ret <8 x float> %shufp @@ -25,7 +25,7 @@ ; ; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr: ; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3],ymm0[6,7],ymm1[6,7] +; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; CHECK-ICX-NEXT: retq %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15> ret <8 x float> %shufp @@ -39,7 +39,7 @@ ; ; CHECK-ICX-LABEL: transform_VUNPCKLPDrr: ; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-ICX-NEXT: retq %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x float> %shufp @@ -53,7 +53,7 @@ ; ; CHECK-ICX-LABEL: transform_VUNPCKHPDrr: ; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,3] +; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-ICX-NEXT: retq %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ret <4 x float> %shufp