Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10000,7 +10000,9 @@
   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
   // we can only broadcast from a register with AVX2.
   unsigned NumElts = Mask.size();
-  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
+  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
+                        ? X86ISD::MOVDDUP
+                        : X86ISD::VBROADCAST;
   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
 
   // Check that the mask is a broadcast.
@@ -10086,7 +10088,9 @@
   // 32-bit targets need to load i64 as a f64 and then bitcast the result.
   if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
     BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
-    Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
+    Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
+                 ? X86ISD::MOVDDUP
+                 : Opcode;
   }
 
   // If we are broadcasting a load that is only used by the shuffle
@@ -27317,7 +27321,7 @@
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
-    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
+    if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
       Shuffle = X86ISD::MOVDDUP;
       SrcVT = DstVT = MVT::v2f64;
       return true;
@@ -36412,6 +36416,7 @@
   case X86ISD::MOVDDUP:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
+  case X86ISD::VBROADCAST:
   case X86ISD::VPPERM:
   case X86ISD::VPERMI:
   case X86ISD::VPERMV:
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -9100,7 +9100,7 @@
 //===----------------------------------------------------------------------===//
 
 multiclass avx512_movddup_128 opc, string OpcodeStr, SDNode OpNode,
-                              X86VectorVTInfo _> {
+                              X86VectorVTInfo _> {
   let ExeDomain = _.ExeDomain in {
   defm rr : AVX512_maskable opc, string OpcodeStr, SDNode OpNode,
                             AVX512VLVectorVTInfo VTInfo> {
-  defm Z : avx512_unary_rm, EVEX_V512;
   let Predicates = [HasAVX512, HasVLX] in {
-    defm Z256 : avx512_unary_rm,
+    defm Z256 : avx512_unary_rm,
                 EVEX_V256;
-    defm Z128 : avx512_movddup_128,
-                EVEX_V128;
+    defm Z128 : avx512_movddup_128,
+                EVEX_V128;
   }
 }
@@ -9134,19 +9134,12 @@
 defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
 
 let Predicates = [HasVLX] in {
-def : Pat<(X86Movddup (loadv2f64 addr:$src)),
-          (VMOVDDUPZ128rm addr:$src)>;
 def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
           (VMOVDDUPZ128rm addr:$src)>;
 def : Pat<(v2f64 (X86VBroadcast f64:$src)),
           (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
-
-def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
-                   (v2f64 VR128X:$src0)),
-          (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
-                   (bitconvert (v4i32 immAllZerosV))),
-          (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+          (VMOVDDUPZ128rm addr:$src)>;
 
 def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
                    (v2f64 VR128X:$src0)),
@@ -9162,6 +9155,13 @@
 def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
                    (bitconvert (v4i32 immAllZerosV))),
           (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+                   (v2f64 VR128X:$src0)),
+          (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+                   (bitconvert (v4i32 immAllZerosV))),
+          (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -7969,6 +7969,11 @@
             (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
   def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
             (VMOVDDUPrm addr:$src)>;
+
+  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
+            (VMOVDDUPrr VR128:$src)>;
+  def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+            (VMOVDDUPrm addr:$src)>;
 }
 
 let Predicates = [HasAVX1Only] in {
Index: test/CodeGen/X86/avx512vl-vbroadcast.ll
===================================================================
--- test/CodeGen/X86/avx512vl-vbroadcast.ll
+++ test/CodeGen/X86/avx512vl-vbroadcast.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+avx512vl| FileCheck %s
 
 declare void @func_f32(float)
@@ -170,3 +170,30 @@
   %r = select <4 x i1> %mask, <4 x double> %c, <4 x double> zeroinitializer
   ret <4 x double> %r
 }
+
+define <2 x double> @test_v2f64_broadcast_fold(<2 x double> *%a0, <2 x double> %a1) {
+; CHECK-LABEL: test_v2f64_broadcast_fold:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddpd (%rdi){1to2}, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = load <2 x double>, <2 x double> *%a0, align 16
+  %2 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %3 = fadd <2 x double> %2, %a1
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_v2f64_broadcast_fold_mask(<2 x double> *%a0, <2 x double> %a1, <2 x i64> %mask1, <2 x double> %a2) {
+; CHECK-LABEL: test_v2f64_broadcast_fold_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT:    vpcmpneqq %xmm3, %xmm1, %k1
+; CHECK-NEXT:    vaddpd (%rdi){1to2}, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %1 = load <2 x double>, <2 x double> *%a0, align 16
+  %2 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %3 = fadd <2 x double> %2, %a1
+  %4 = select <2 x i1> %mask, <2 x double> %3, <2 x double> %a2
+  ret <2 x double> %4
+}
Index: test/CodeGen/X86/sse3-schedule.ll
===================================================================
--- test/CodeGen/X86/sse3-schedule.ll
+++ test/CodeGen/X86/sse3-schedule.ll
@@ -501,14 +501,14 @@
 ; GENERIC:       # BB#0:
 ; GENERIC-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:1.00]
 ; GENERIC-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0] sched: [6:0.50]
-; GENERIC-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT:    subpd %xmm1, %xmm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_movddup:
 ; ATOM:       # BB#0:
 ; ATOM-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0] sched: [1:1.00]
 ; ATOM-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
-; ATOM-NEXT:    addpd %xmm0, %xmm1 # sched: [6:3.00]
+; ATOM-NEXT:    subpd %xmm0, %xmm1 # sched: [6:3.00]
 ; ATOM-NEXT:    movapd %xmm1, %xmm0 # sched: [1:0.50]
 ; ATOM-NEXT:    retq # sched: [79:39.50]
 ;
@@ -516,54 +516,54 @@
 ; SLM:       # BB#0:
 ; SLM-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:1.00]
 ; SLM-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0] sched: [3:1.00]
-; SLM-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT:    subpd %xmm1, %xmm0 # sched: [3:1.00]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_movddup:
 ; SANDY:       # BB#0:
 ; SANDY-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
 ; SANDY-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [6:0.50]
-; SANDY-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movddup:
 ; HASWELL:       # BB#0:
 ; HASWELL-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
 ; HASWELL-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [1:0.50]
-; HASWELL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
 ; HASWELL-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKYLAKE-LABEL: test_movddup:
 ; SKYLAKE:       # BB#0:
 ; SKYLAKE-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
 ; SKYLAKE-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [1:0.50]
-; SKYLAKE-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [2:1.00]
 ;
 ; SKX-LABEL: test_movddup:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
 ; SKX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [5:0.50]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movddup:
 ; BTVER2:       # BB#0:
 ; BTVER2-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [5:1.00]
 ; BTVER2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50]
-; BTVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_movddup:
 ; ZNVER1:       # BB#0:
 ; ZNVER1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [8:0.50]
 ; ZNVER1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50]
-; ZNVER1-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
   %2 = load <2 x double>, <2 x double> *%a1, align 16
   %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
-  %4 = fadd <2 x double> %1, %3
+  %4 = fsub <2 x double> %3, %1 ; stop the movddup from being folded as a broadcast load
   ret <2 x double> %4
 }
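
Note (illustrative, not part of the patch): with the new AVX512VL patterns above, a v2f64 splat of a full-vector load that feeds an EVEX-foldable operation is expected to use an embedded-broadcast memory operand, e.g. vmulpd (%rdi){1to2}, when compiled with llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl. The function and value names below are hypothetical, and the fmul is only a stand-in for the fadd exercised by the tests in this patch:

define <2 x double> @splat_load_fmul(<2 x double>* %p, <2 x double> %x) {
  ; Load the full <2 x double> vector, then splat element 0.
  %v = load <2 x double>, <2 x double>* %p, align 16
  %s = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
  ; The splatted operand should fold into the multiply as a {1to2} broadcast.
  %r = fmul <2 x double> %s, %x
  ret <2 x double> %r
}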