DAGCombiner: guard the splat-to-CONCAT_VECTORS fold with a type legality check.

The fold builds a CONCAT_VECTORS of N->getNumOperands() copies of Splat with
type NewVT and bitcasts the result to VT. With this patch the node is only
created when !LegalTypes || TLI.isTypeLegal(NewVT) holds.

test/CodeGen/AMDGPU/concat_vectors.ll switches its check prefixes from
FUNC/SI to GCN plus a per-run SI or VI prefix, and gains the
build_vector_splat_concat_v8i16 test, which is checked on VI only.

Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15458,9 +15458,14 @@
         unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements();
         EVT NewVT = EVT::getVectorVT(*DAG.getContext(),
                                      SrcVT.getVectorElementType(), NumElts);
-        SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat);
-        SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), NewVT, Ops);
-        return DAG.getBitcast(VT, Concat);
+        // Only form the CONCAT_VECTORS when NewVT is a legal type, unless
+        // LegalTypes is not set.
+        if (!LegalTypes || TLI.isTypeLegal(NewVT)) {
+          SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat);
+          SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N),
+                                       NewVT, Ops);
+          return DAG.getBitcast(VT, Concat);
+        }
       }
     }
   }
Index: test/CodeGen/AMDGPU/concat_vectors.ll
===================================================================
--- test/CodeGen/AMDGPU/concat_vectors.ll
+++ test/CodeGen/AMDGPU/concat_vectors.ll
@@ -1,291 +1,291 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
-; FUNC-LABEL: {{^}}test_concat_v1i32:
+; GCN-LABEL: {{^}}test_concat_v1i32:
 ; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF
 ; instructions that access scratch memory. Bit 23, which is the add_tid_enable
 ; bit, is only set for scratch access, so we can check for the absence of this
 ; value if we want to ensure scratch memory is not being used.
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
   %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v2i32:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v2i32:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
   %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v4i32:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v4i32:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
   %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v8i32:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v8i32:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
   %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v16i32:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v16i32:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
   %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v1f32:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v1f32:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
   %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1>
   store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v2f32:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v2f32:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v4f32:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v4f32:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v8f32:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v8f32:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v16f32:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v16f32:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v1i64:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v1i64:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
   %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
   store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v2i64:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v2i64:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v4i64:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v4i64:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v8i64:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v8i64:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v16i64:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v16i64:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v1f64:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v1f64:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
   %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
   store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v2f64:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v2f64:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v4f64:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v4f64:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v8f64:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v8f64:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v16f64:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v16f64:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v1i1:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v1i1:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
   %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i1> %concat, <2 x i1> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v2i1:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v2i1:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
   %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i1> %concat, <4 x i1> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v4i1:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v4i1:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
   %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i1> %concat, <8 x i1> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v8i1:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v8i1:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
   %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i1> %concat, <16 x i1> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v16i1:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v16i1:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
   %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i1> %concat, <32 x i1> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v32i1:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v32i1:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
   %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
   store <64 x i1> %concat, <64 x i1> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v1i16:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v1i16:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
   %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v2i16:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v2i16:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
   %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v4i16:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v4i16:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
   %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v8i16:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v8i16:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
   %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_concat_v16i16:
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
-; SI-NOT: movrel
+; GCN-LABEL: {{^}}test_concat_v16i16:
+; GCN-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
+; GCN-NOT: movrel
 define amdgpu_kernel void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
   %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64
   ret void
 }
 
-; FUNC-LABEL: {{^}}concat_vector_crash:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}concat_vector_crash:
+; GCN: s_endpgm
 define amdgpu_kernel void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 bb:
   %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
@@ -295,8 +295,8 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}concat_vector_crash2:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}concat_vector_crash2:
+; GCN: s_endpgm
 define amdgpu_kernel void @concat_vector_crash2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) {
   %tmp = load i32, i32 addrspace(1)* %in, align 1
   %tmp1 = trunc i32 %tmp to i24
@@ -306,3 +306,16 @@
   store <8 x i8> %tmp4, <8 x i8> addrspace(1)* %out, align 8
   ret void
 }
+
+; Two <8 x i16> zeroinitializer stores to addrspace(3). On VI the output is
+; expected to contain a v_mov_b32 of 0, a ds_write_b64 and a ds_write2_b64.
+; GCN-LABEL: {{^}}build_vector_splat_concat_v8i16:
+; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI: ds_write_b64
+; VI: ds_write2_b64
+define amdgpu_kernel void @build_vector_splat_concat_v8i16() {
+entry:
+  store <8 x i16> zeroinitializer, <8 x i16> addrspace(3)* undef, align 16
+  store <8 x i16> zeroinitializer, <8 x i16> addrspace(3)* null, align 16
+  ret void
+}