diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7626,6 +7626,10 @@
             (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
               (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
               (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
+  def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+              (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm),
+              (v2f64 (VMOVDDUPrr VR128:$src)), 1)>;
 
   def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
             (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll
--- a/llvm/test/CodeGen/X86/combine-concatvectors.ll
+++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
 
 define void @PR32957(<2 x float>* %in, <8 x float>* %out) {
 ; CHECK-LABEL: PR32957:
@@ -38,3 +38,50 @@
   %extr1 = extractelement <2 x float> %bc, i64 0
   unreachable
 }
+
+@qa_ = external unnamed_addr global [49216 x i8], align 32
+
+define void @concat_of_broadcast_v2f64_v4f64() {
+; AVX1-LABEL: concat_of_broadcast_v2f64_v4f64:
+; AVX1:       # %bb.0: # %alloca_0
+; AVX1-NEXT:    movq qa_@GOTPCREL(%rip), %rax
+; AVX1-NEXT:    movl $1091567616, 30256(%rax) # imm = 0x41100000
+; AVX1-NEXT:    movabsq $4294967297, %rcx # imm = 0x100000001
+; AVX1-NEXT:    movq %rcx, 46348(%rax)
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = <1.0E+0,1.0E+0,u,u>
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT:    vmovups %ymm1, 48296(%rax)
+; AVX1-NEXT:    vmovlps %xmm0, 47372(%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: concat_of_broadcast_v2f64_v4f64:
+; AVX2:       # %bb.0: # %alloca_0
+; AVX2-NEXT:    movq qa_@GOTPCREL(%rip), %rax
+; AVX2-NEXT:    movl $1091567616, 30256(%rax) # imm = 0x41100000
+; AVX2-NEXT:    movabsq $4294967297, %rcx # imm = 0x100000001
+; AVX2-NEXT:    movq %rcx, 46348(%rax)
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm1
+; AVX2-NEXT:    vmovups %ymm1, 48296(%rax)
+; AVX2-NEXT:    vmovlps %xmm0, 47372(%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+alloca_0:
+  store float 9.000000e+00, float* bitcast (i8* getelementptr inbounds ([49216 x i8], [49216 x i8]* @qa_, i64 0, i64 30256) to float*), align 16
+  store <2 x i32> <i32 1, i32 1>, <2 x i32>* bitcast (i8* getelementptr inbounds ([49216 x i8], [49216 x i8]* @qa_, i64 0, i64 46348) to <2 x i32>*), align 4
+  br label %loop.4942
+
+loop.4942:                                        ; preds = %loop.4942, %alloca_0
+  br i1 undef, label %loop.4942, label %ifmerge.1298
+
+ifmerge.1298:                                     ; preds = %loop.4942
+  %gepload4638 = load float, float* bitcast (i8* getelementptr inbounds ([49216 x i8], [49216 x i8]* @qa_, i64 0, i64 28324) to float*), align 4
+  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float>* bitcast (i8* getelementptr inbounds ([49216 x i8], [49216 x i8]* @qa_, i64 0, i64 48296) to <2 x float>*), align 8
+  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float>* bitcast (i8* getelementptr inbounds ([49216 x i8], [49216 x i8]* @qa_, i64 0, i64 48304) to <2 x float>*), align 16
+  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float>* bitcast (i8* getelementptr inbounds ([49216 x i8], [49216 x i8]* @qa_, i64 0, i64 48312) to <2 x float>*), align 8
+  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float>* bitcast (i8* getelementptr inbounds ([49216 x i8], [49216 x i8]* @qa_, i64 0, i64 48320) to <2 x float>*), align 32
+  store <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float>* bitcast (i8* getelementptr inbounds ([49216 x i8], [49216 x i8]* @qa_, i64 0, i64 47372) to <2 x float>*), align 4
+  ret void
+}
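
Note (reviewer context, not part of the patch): the new pattern selects an X86ISD::VBROADCAST whose source is already a 128-bit vector, a case the surrounding AVX1-only patterns did not cover for register operands. As a minimal sketch, assuming the usual lowering of an all-zero-index shufflevector to X86ISD::VBROADCAST (the function name is hypothetical), IR like the following exercises the new pattern:

; Broadcast lane 0 of a <2 x double> to all four lanes of a <4 x double>;
; with -mattr=+avx this can now be selected as vmovddup + vinsertf128.
define <4 x double> @broadcast_lo_v2f64(<2 x double> %v) {
  %b = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %b
}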