Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -10000,7 +10000,9 @@
   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
   // we can only broadcast from a register with AVX2.
   unsigned NumElts = Mask.size();
-  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
+  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
+                        ? X86ISD::MOVDDUP
+                        : X86ISD::VBROADCAST;
   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
 
   // Check that the mask is a broadcast.
@@ -10086,7 +10088,9 @@
   // 32-bit targets need to load i64 as a f64 and then bitcast the result.
   if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
     BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
-    Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
+    Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
+                 ? X86ISD::MOVDDUP
+                 : Opcode;
   }
 
   // If we are broadcasting a load that is only used by the shuffle
@@ -27317,7 +27321,7 @@
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
-    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
+    if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
       Shuffle = X86ISD::MOVDDUP;
       SrcVT = DstVT = MVT::v2f64;
       return true;
@@ -36412,6 +36416,7 @@
   case X86ISD::MOVDDUP:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
+  case X86ISD::VBROADCAST:
   case X86ISD::VPPERM:
   case X86ISD::VPERMI:
   case X86ISD::VPERMV:

Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -9100,7 +9100,7 @@
 //===----------------------------------------------------------------------===//
 
 multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                              X86VectorVTInfo _> {
+                              X86VectorVTInfo _> {
   let ExeDomain = _.ExeDomain in {
   defm rr : AVX512_maskable

 multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  AVX512VLVectorVTInfo VTInfo> {
-  defm Z : avx512_unary_rm, EVEX_V512;
+  defm Z : avx512_unary_rm, EVEX_V512;
 
   let Predicates = [HasAVX512, HasVLX] in {
-    defm Z256 : avx512_unary_rm,
+    defm Z256 : avx512_unary_rm,
                 EVEX_V256;
-    defm Z128 : avx512_movddup_128,
-                EVEX_V128;
+    defm Z128 : avx512_movddup_128,
+                EVEX_V128;
   }
 }
@@ -9134,19 +9134,12 @@
 defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
 
 let Predicates = [HasVLX] in {
-def : Pat<(X86Movddup (loadv2f64 addr:$src)),
-          (VMOVDDUPZ128rm addr:$src)>;
 def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
           (VMOVDDUPZ128rm addr:$src)>;
 def : Pat<(v2f64 (X86VBroadcast f64:$src)),
           (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
-
-def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
-                   (v2f64 VR128X:$src0)),
-          (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
-                   (bitconvert (v4i32 immAllZerosV))),
-          (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+          (VMOVDDUPZ128rm addr:$src)>;
 
 def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
                    (v2f64 VR128X:$src0)),
@@ -9162,6 +9155,13 @@
 def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
                    (bitconvert (v4i32 immAllZerosV))),
           (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+                   (v2f64 VR128X:$src0)),
+          (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+                   (bitconvert (v4i32 immAllZerosV))),
+          (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//

Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -7969,6 +7969,11 @@
             (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
   def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
             (VMOVDDUPrm addr:$src)>;
+
+  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
+            (VMOVDDUPrr VR128:$src)>;
+  def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+            (VMOVDDUPrm addr:$src)>;
 }
 
 let Predicates = [HasAVX1Only] in {

Index: llvm/trunk/test/CodeGen/X86/avx512vl-vbroadcast.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vbroadcast.ll
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vbroadcast.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+avx512vl| FileCheck %s
 
 declare void @func_f32(float)
@@ -170,3 +170,30 @@
   %r = select <4 x i1> %mask, <4 x double> %c, <4 x double> zeroinitializer
   ret <4 x double> %r
 }
+
+define <2 x double> @test_v2f64_broadcast_fold(<2 x double> *%a0, <2 x double> %a1) {
+; CHECK-LABEL: test_v2f64_broadcast_fold:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddpd (%rdi){1to2}, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = load <2 x double>, <2 x double> *%a0, align 16
+  %2 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %3 = fadd <2 x double> %2, %a1
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_v2f64_broadcast_fold_mask(<2 x double> *%a0, <2 x double> %a1, <2 x i64> %mask1, <2 x double> %a2) {
+; CHECK-LABEL: test_v2f64_broadcast_fold_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpneqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vaddpd (%rdi){1to2}, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %1 = load <2 x double>, <2 x double> *%a0, align 16
+  %2 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %3 = fadd <2 x double> %2, %a1
+  %4 = select <2 x i1> %mask, <2 x double> %3, <2 x double> %a2
+  ret <2 x double> %4
+}
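For reference, a minimal IR sketch of the register (non-load) splat that the new X86InstrSSE.td pattern (v2f64 (X86VBroadcast v2f64:$src)) is aimed at; the function name below is illustrative and not taken from the patch or its tests. With this change an AVX2+ target is expected to model the splat as X86ISD::VBROADCAST during lowering, but it should still select a (v)movddup instruction, since VBROADCASTSD has no 128-bit register form; the output can be checked with llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2.

; Sketch only, not part of the patch: splat of element 0 of a v2f64 register value.
define <2 x double> @splat_reg_v2f64(<2 x double> %x) {
  %s = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %s
}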