diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp --- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp +++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp @@ -154,14 +154,18 @@ return true; }; - // `vunpcklpd/vmovlhps r, r` -> `vshufpd r, r, 0x00` - // `vunpckhpd/vmovlhps r, r` -> `vshufpd r, r, 0xff` - // `vunpcklpd r, r, k` -> `vshufpd r, r, 0x00` - // `vunpckhpd r, r, k` -> `vshufpd r, r, 0xff` - // iff `vshufps` is faster than `vunpck{l|h}pd`. Otherwise stick with - // `vunpck{l|h}pd` as it uses less code size. - // TODO: Look into using `{VP}UNPCK{L|H}QDQ{...}` instead of `{V}SHUF{...}PS` - // as the replacement. `{VP}UNPCK{L|H}QDQ{...}` has no codesize cost. + // `vunpcklpd/vmovlhps r, r` -> `vpunpcklqdq r, r`/`vshufpd r, r, 0x00` + // `vunpckhpd r, r` -> `vpunpckhqdq r, r`/`vshufpd r, r, 0xff` + // `vunpcklpd r, r, k` -> `vpunpcklqdq r, r, k`/`vshufpd r, r, k, 0x00` + // `vunpckhpd r, r, k` -> `vpunpckhqdq r, r, k`/`vshufpd r, r, k, 0xff` + // `vunpcklpd r, m` -> `vpunpcklqdq r, m` + // `vunpckhpd r, m` -> `vpunpckhqdq r, m` + // `vunpcklpd r, m, k` -> `vpunpcklqdq r, m, k` + // `vunpckhpd r, m, k` -> `vpunpckhqdq r, m, k` + // 1) If there is no bypass delay and `vpunpck{l|h}qdq` is faster than + // `vunpck{l|h}pd` -> `vpunpck{l|h}qdq` + // 2) If `vshufpd` is faster than `vunpck{l|h}pd` + // -> `vshufpd` auto ProcessUNPCKPD = [&](unsigned NewOpc, unsigned MaskImm) -> bool { if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) return false; @@ -171,13 +175,34 @@ return true; }; - auto ProcessUNPCKLPDrr = [&](unsigned NewOpc) -> bool { + auto ProcessUNPCKPDToIntDomain = [&](unsigned NewOpc) -> bool { + // TODO: It may be worth setting ReplaceInTie to `true`, as there is no real + // downside to the integer unpck, but if someone doesn't specify an exact + // target we won't find it faster.
+ if (!ST->hasNoDomainDelayShuffle() || + !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false)) + return false; + MI.setDesc(TII->get(NewOpc)); + return true; + }; + + auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain, + unsigned NewOpc) -> bool { + if (ProcessUNPCKPDToIntDomain(NewOpcIntDomain)) + return true; return ProcessUNPCKPD(NewOpc, 0x00); }; - auto ProcessUNPCKHPDrr = [&](unsigned NewOpc) -> bool { + auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain, + unsigned NewOpc) -> bool { + if (ProcessUNPCKPDToIntDomain(NewOpcIntDomain)) + return true; return ProcessUNPCKPD(NewOpc, 0xff); }; + auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool { + return ProcessUNPCKPDToIntDomain(NewOpcIntDomain); + }; + switch (Opc) { case X86::VPERMILPSri: return ProcessVPERMILPSri(X86::VSHUFPSrri); @@ -226,64 +251,106 @@ case X86::VPERMILPSZmik: return ProcessVPERMILPSmi(X86::VPSHUFDZmik); - // TODO: {V}UNPCK{L|H}PD{...} is probably safe to transform to - // `{VP}UNPCK{L|H}QDQ{...}` which gets the same perf benefit as - // `{V}SHUF{...}PS` but 1) without increasing code size and 2) can also - // handle the `mr` case. ICL doesn't have a domain penalty for replacing - // float unpck -> int unpck, but at this time, I haven't verified the set of - // processors where its safe. case X86::MOVLHPSrr: case X86::UNPCKLPDrr: - return ProcessUNPCKLPDrr(X86::SHUFPDrri); + return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri); case X86::VMOVLHPSrr: case X86::VUNPCKLPDrr: - return ProcessUNPCKLPDrr(X86::VSHUFPDrri); + return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri); case X86::VUNPCKLPDYrr: - return ProcessUNPCKLPDrr(X86::VSHUFPDYrri); + return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri); // VMOVLHPS is always 128 bits. 
case X86::VMOVLHPSZrr: case X86::VUNPCKLPDZ128rr: - return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rri); + return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri); case X86::VUNPCKLPDZ256rr: - return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rri); + return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri); case X86::VUNPCKLPDZrr: - return ProcessUNPCKLPDrr(X86::VSHUFPDZrri); + return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri); case X86::VUNPCKLPDZ128rrk: - return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrik); + return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik); case X86::VUNPCKLPDZ256rrk: - return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrik); + return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik); case X86::VUNPCKLPDZrrk: - return ProcessUNPCKLPDrr(X86::VSHUFPDZrrik); + return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik); case X86::VUNPCKLPDZ128rrkz: - return ProcessUNPCKLPDrr(X86::VSHUFPDZ128rrikz); + return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz); case X86::VUNPCKLPDZ256rrkz: - return ProcessUNPCKLPDrr(X86::VSHUFPDZ256rrikz); + return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz); case X86::VUNPCKLPDZrrkz: - return ProcessUNPCKLPDrr(X86::VSHUFPDZrrikz); + return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz); case X86::UNPCKHPDrr: - return ProcessUNPCKHPDrr(X86::SHUFPDrri); + return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri); case X86::VUNPCKHPDrr: - return ProcessUNPCKHPDrr(X86::VSHUFPDrri); + return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri); case X86::VUNPCKHPDYrr: - return ProcessUNPCKHPDrr(X86::VSHUFPDYrri); + return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri); case X86::VUNPCKHPDZ128rr: - return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rri); + return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri); case X86::VUNPCKHPDZ256rr: - return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rri); + return 
ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri); case X86::VUNPCKHPDZrr: - return ProcessUNPCKHPDrr(X86::VSHUFPDZrri); + return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri); case X86::VUNPCKHPDZ128rrk: - return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrik); + return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik); case X86::VUNPCKHPDZ256rrk: - return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrik); + return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik); case X86::VUNPCKHPDZrrk: - return ProcessUNPCKHPDrr(X86::VSHUFPDZrrik); + return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik); case X86::VUNPCKHPDZ128rrkz: - return ProcessUNPCKHPDrr(X86::VSHUFPDZ128rrikz); + return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz); case X86::VUNPCKHPDZ256rrkz: - return ProcessUNPCKHPDrr(X86::VSHUFPDZ256rrikz); + return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz); case X86::VUNPCKHPDZrrkz: - return ProcessUNPCKHPDrr(X86::VSHUFPDZrrikz); + return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz); + case X86::UNPCKLPDrm: + return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm); + case X86::VUNPCKLPDrm: + return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm); + case X86::VUNPCKLPDYrm: + return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm); + case X86::VUNPCKLPDZ128rm: + return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm); + case X86::VUNPCKLPDZ256rm: + return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm); + case X86::VUNPCKLPDZrm: + return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm); + case X86::VUNPCKLPDZ128rmk: + return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk); + case X86::VUNPCKLPDZ256rmk: + return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk); + case X86::VUNPCKLPDZrmk: + return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk); + case X86::VUNPCKLPDZ128rmkz: + return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz); + case X86::VUNPCKLPDZ256rmkz: + return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz); + case X86::VUNPCKLPDZrmkz: + 
return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz); + case X86::UNPCKHPDrm: + return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm); + case X86::VUNPCKHPDrm: + return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm); + case X86::VUNPCKHPDYrm: + return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm); + case X86::VUNPCKHPDZ128rm: + return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm); + case X86::VUNPCKHPDZ256rm: + return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm); + case X86::VUNPCKHPDZrm: + return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm); + case X86::VUNPCKHPDZ128rmk: + return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk); + case X86::VUNPCKHPDZ256rmk: + return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk); + case X86::VUNPCKHPDZrmk: + return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk); + case X86::VUNPCKHPDZ128rmkz: + return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz); + case X86::VUNPCKHPDZ256rmkz: + return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz); + case X86::VUNPCKHPDZrmkz: + return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz); default: return false; } diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll --- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll @@ -31,10 +31,15 @@ ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; 
; CHECK-V4-LABEL: transform_VUNPCKLPDYrr: ; CHECK-V4: # %bb.0: @@ -60,10 +65,15 @@ ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKHPDYrr: ; CHECK-V4: # %bb.0: @@ -89,10 +99,15 @@ ; CHECK-SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKLPDrr: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKLPDrr: ; CHECK-V4: # %bb.0: @@ -118,10 +133,15 @@ ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKHPDrr: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: 
vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKHPDrr: ; CHECK-V4: # %bb.0: @@ -172,11 +192,17 @@ ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKLPDYrrkz: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: kmovd %edi, %k1 -; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrkz: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrkz: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKLPDYrrkz: ; CHECK-V4: # %bb.0: @@ -208,11 +234,17 @@ ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKHPDYrrkz: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: kmovd %edi, %k1 -; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrkz: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; 
CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrkz: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKHPDYrrkz: ; CHECK-V4: # %bb.0: @@ -244,11 +276,17 @@ ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKLPDrrkz: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: kmovd %edi, %k1 -; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrkz: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrkz: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKLPDrrkz: ; CHECK-V4: # %bb.0: @@ -280,11 +318,17 @@ ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKHPDrrkz: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: kmovd %edi, %k1 -; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrkz: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrkz: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, 
%k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKHPDrrkz: ; CHECK-V4: # %bb.0: @@ -343,12 +387,19 @@ ; CHECK-SKX-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKLPDYrrk: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: kmovd %edi, %k1 -; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; CHECK-ICX-NEXT: vmovapd %ymm2, %ymm0 -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrk: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrk: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKLPDYrrk: ; CHECK-V4: # %bb.0: @@ -384,12 +435,19 @@ ; CHECK-SKX-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKHPDYrrk: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: kmovd %edi, %k1 -; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; CHECK-ICX-NEXT: vmovapd %ymm2, %ymm0 -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrk: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: 
transform_VUNPCKHPDYrrk: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0 +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKHPDYrrk: ; CHECK-V4: # %bb.0: @@ -425,12 +483,19 @@ ; CHECK-SKX-NEXT: vmovapd %xmm2, %xmm0 ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKLPDrrk: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: kmovd %edi, %k1 -; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] -; CHECK-ICX-NEXT: vmovapd %xmm2, %xmm0 -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrk: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrk: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] +; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKLPDrrk: ; CHECK-V4: # %bb.0: @@ -466,12 +531,19 @@ ; CHECK-SKX-NEXT: vmovapd %xmm2, %xmm0 ; CHECK-SKX-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKHPDrrk: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: kmovd %edi, %k1 -; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] -; CHECK-ICX-NEXT: vmovapd %xmm2, %xmm0 -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrk: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0 +; 
CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrk: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] +; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0 +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKHPDrrk: ; CHECK-V4: # %bb.0: @@ -520,40 +592,140 @@ } define <8 x float> @transform_VUNPCKLPDYrm(<8 x float> %a, ptr %pb) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDYrm: -; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKLPDYrm: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKLPDYrm: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrm: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrm: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-ZNVER4-NEXT: retq %b = load <8 x float>, ptr %pb %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shufp } define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, 
ptr %pb) nounwind { -; CHECK-LABEL: transform_VUNPCKHPDYrm: -; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKHPDYrm: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKHPDYrm: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrm: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrm: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-ZNVER4-NEXT: retq %b = load <8 x float>, ptr %pb %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shufp } define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDrm: -; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKLPDrm: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; 
CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKLPDrm: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKLPDrm: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrm: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-ZNVER4-NEXT: retq %b = load <4 x float>, ptr %pb %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shufp } define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind { -; CHECK-LABEL: transform_VUNPCKHPDrm: -; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKHPDrm: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKHPDrm: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKHPDrm: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrm: +; 
CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-ZNVER4-NEXT: retq %b = load <4 x float>, ptr %pb %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shufp @@ -586,11 +758,41 @@ } define <4 x double> @transform_VUNPCKLPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDYrmkz: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKLPDYrmkz: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %esi, %k1 +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmkz: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmkz: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKLPDYrmkz: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %esi, %k1 +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrmkz: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %esi, %k1 +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrmkz: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 +; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] +; 
CHECK-ZNVER4-NEXT: retq %mask = bitcast i4 %mask_int to <4 x i1> %b = load <4 x double>, ptr %pb %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -599,11 +801,41 @@ } define <4 x double> @transform_VUNPCKHPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKHPDYrmkz: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKHPDYrmkz: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %esi, %k1 +; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmkz: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmkz: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKHPDYrmkz: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %esi, %k1 +; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrmkz: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %esi, %k1 +; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrmkz: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 +; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i4 %mask_int to <4 x i1> %b = load <4 x 
double>, ptr %pb %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -612,11 +844,41 @@ } define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDrmkz: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKLPDrmkz: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %esi, %k1 +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKLPDrmkz: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %esi, %k1 +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmkz: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %esi, %k1 +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmkz: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 +; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %b = load <2 x double>, ptr %pb %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> @@ -625,11 +887,41 @@ } define <2 x double> @transform_VUNPCKHPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_int) 
nounwind { -; CHECK-LABEL: transform_VUNPCKHPDrmkz: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKHPDrmkz: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %esi, %k1 +; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKHPDrmkz: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %esi, %k1 +; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmkz: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %esi, %k1 +; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmkz: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 +; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %b = load <2 x double>, ptr %pb %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> @@ -666,12 +958,47 @@ } define <4 x double> @transform_VUNPCKLPDYrmk(<4 x double> %a, ptr %pb, <4 x double> %c, i4 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDYrmk: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = 
ymm0[0],mem[0],ymm0[2],mem[2] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKLPDYrmk: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %esi, %k1 +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-SKX-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmk: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmk: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKLPDYrmk: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %esi, %k1 +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-V4-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrmk: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %esi, %k1 +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-AVX512-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrmk: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 +; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-ZNVER4-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i4 %mask_int to <4 x i1> %b = load <4 x double>, ptr %pb %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -680,12 +1007,47 @@ } define <4 x double> 
@transform_VUNPCKHPDYrmk(<4 x double> %a, ptr %pb, <4 x double> %c, i4 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKHPDYrmk: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKHPDYrmk: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %esi, %k1 +; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-SKX-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmk: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmk: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKHPDYrmk: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %esi, %k1 +; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-V4-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrmk: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %esi, %k1 +; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-AVX512-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrmk: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 +; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-ZNVER4-NEXT: vmovapd %ymm1, %ymm0 
+; CHECK-ZNVER4-NEXT: retq %mask = bitcast i4 %mask_int to <4 x i1> %b = load <4 x double>, ptr %pb %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -694,12 +1056,47 @@ } define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x double> %c, i2 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDrmk: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] -; CHECK-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKLPDrmk: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %esi, %k1 +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-SKX-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKLPDrmk: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %esi, %k1 +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-V4-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmk: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %esi, %k1 +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-AVX512-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-AVX512-NEXT: retq +; +; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmk: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 +; 
CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-ZNVER4-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %b = load <2 x double>, ptr %pb %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> @@ -708,12 +1105,47 @@ } define <2 x double> @transform_VUNPCKHPDrmk(<2 x double> %a, ptr %pb, <2 x double> %c, i2 %mask_int) nounwind { -; CHECK-LABEL: transform_VUNPCKHPDrmk: -; CHECK: # %bb.0: -; CHECK-NEXT: kmovd %esi, %k1 -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] -; CHECK-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: transform_VUNPCKHPDrmk: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: kmovd %esi, %k1 +; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] +; CHECK-SKX-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] +; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-V4-LABEL: transform_VUNPCKHPDrmk: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: kmovd %esi, %k1 +; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] +; CHECK-V4-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-V4-NEXT: retq +; +; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmk: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: kmovd %esi, %k1 +; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] +; CHECK-AVX512-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-AVX512-NEXT: retq +; +; 
CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmk: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 +; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] +; CHECK-ZNVER4-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-ZNVER4-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %b = load <2 x double>, ptr %pb %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> @@ -721,5 +1153,4 @@ ret <2 x double> %res } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-ICX-BYPASS-DELAY: {{.*}} -; CHECK-ICX-NO-BYPASS-DELAY: {{.*}} +; CHECK-ICX: {{.*}} diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll --- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll @@ -12,10 +12,15 @@ ; CHECK-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-AVX2-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKLPDYrr: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-SNB-LABEL: transform_VUNPCKLPDYrr: ; CHECK-SNB: # %bb.0: @@ -31,10 +36,15 @@ ; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; CHECK-AVX2-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKHPDYrr: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: 
transform_VUNPCKHPDYrr: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-SNB-LABEL: transform_VUNPCKHPDYrr: ; CHECK-SNB: # %bb.0: @@ -50,15 +60,25 @@ ; CHECK-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-AVX2-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKLPDrr: -; CHECK-ICX: # %bb.0: -; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq ; -; CHECK-SNB-LABEL: transform_VUNPCKLPDrr: -; CHECK-SNB: # %bb.0: -; CHECK-SNB-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-SNB-NEXT: retq +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr: +; CHECK-SNB-NO-BYPASS-DELAY: # %bb.0: +; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SNB-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr: +; CHECK-SNB-BYPASS-DELAY: # %bb.0: +; CHECK-SNB-BYPASS-DELAY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SNB-BYPASS-DELAY-NEXT: retq %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shufp } @@ -69,62 +89,140 @@ ; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-AVX2-NEXT: retq ; -; CHECK-ICX-LABEL: transform_VUNPCKHPDrr: -; CHECK-ICX: # %bb.0: 
-; CHECK-ICX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; CHECK-ICX-NEXT: retq +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq ; -; CHECK-SNB-LABEL: transform_VUNPCKHPDrr: -; CHECK-SNB: # %bb.0: -; CHECK-SNB-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; CHECK-SNB-NEXT: retq +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr: +; CHECK-SNB-NO-BYPASS-DELAY: # %bb.0: +; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-SNB-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr: +; CHECK-SNB-BYPASS-DELAY: # %bb.0: +; CHECK-SNB-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-SNB-BYPASS-DELAY-NEXT: retq %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shufp } define <8 x float> @transform_VUNPCKLPDYrm(<8 x float> %a, ptr %pb) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDYrm: -; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; CHECK-NEXT: retq +; CHECK-AVX2-LABEL: transform_VUNPCKLPDYrm: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-AVX2-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; 
CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-SNB-LABEL: transform_VUNPCKLPDYrm: +; CHECK-SNB: # %bb.0: +; CHECK-SNB-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; CHECK-SNB-NEXT: retq %b = load <8 x float>, ptr %pb %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shufp } define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind { -; CHECK-LABEL: transform_VUNPCKHPDYrm: -; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; CHECK-NEXT: retq +; CHECK-AVX2-LABEL: transform_VUNPCKHPDYrm: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-AVX2-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-SNB-LABEL: transform_VUNPCKHPDYrm: +; CHECK-SNB: # %bb.0: +; CHECK-SNB-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; CHECK-SNB-NEXT: retq %b = load <8 x float>, ptr %pb %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shufp } define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind { -; CHECK-LABEL: transform_VUNPCKLPDrm: -; CHECK: # %bb.0: -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: retq +; CHECK-AVX2-LABEL: transform_VUNPCKLPDrm: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-AVX2-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = 
xmm0[0],mem[0] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: +; CHECK-SNB-NO-BYPASS-DELAY: # %bb.0: +; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-SNB-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: +; CHECK-SNB-BYPASS-DELAY: # %bb.0: +; CHECK-SNB-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-SNB-BYPASS-DELAY-NEXT: retq %b = load <4 x float>, ptr %pb %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shufp } define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind { -; CHECK-LABEL: transform_VUNPCKHPDrm: -; CHECK: # %bb.0: -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; CHECK-NEXT: retq +; CHECK-AVX2-LABEL: transform_VUNPCKHPDrm: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-AVX2-NEXT: retq +; +; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: +; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: +; CHECK-ICX-BYPASS-DELAY: # %bb.0: +; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; +; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: +; CHECK-SNB-NO-BYPASS-DELAY: # %bb.0: +; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-SNB-NO-BYPASS-DELAY-NEXT: retq +; +; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: +; CHECK-SNB-BYPASS-DELAY: # %bb.0: +; CHECK-SNB-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; 
CHECK-SNB-BYPASS-DELAY-NEXT: retq %b = load <4 x float>, ptr %pb %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shufp } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-ICX-BYPASS-DELAY: {{.*}} -; CHECK-ICX-NO-BYPASS-DELAY: {{.*}} +; CHECK: {{.*}} +; CHECK-ICX: {{.*}} ; CHECK-SKL: {{.*}} -; CHECK-SNB-BYPASS-DELAY: {{.*}} -; CHECK-SNB-NO-BYPASS-DELAY: {{.*}} ; CHECK-V3: {{.*}}