Regenerate the CHECK lines of build_vector.ll with utils/update_llc_test_checks.py.
The shared SI prefix is split into GFX6 (default amdgcn) and GFX8 (tonga), a
gfx1030 RUN line (GFX10) is added, and two <2 x i16> build-vector tests are
added: one storing a constant vector and one inserting a truncated lshr result.

Index: llvm/test/CodeGen/AMDGPU/build_vector.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -1,35 +1,220 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
-; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefixes=R600
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=GFX6
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8
+; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10
 
-; R600: {{^}}build_vector2:
-; R600: MOV
-; R600: MOV
-; R600-NOT: MOV
-; SI: {{^}}build_vector2:
-; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
-; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
-; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}}
 define amdgpu_kernel void @build_vector2 (<2 x i32> addrspace(1)* %out) {
+; R600-LABEL: build_vector2:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.Y, literal.x,
+; R600-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; R600-NEXT: MOV T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 5(7.006492e-45), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector2:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, 6
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector2:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, 6
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector2:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, 6
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
 entry:
   store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-; R600: {{^}}build_vector4:
-; R600: MOV
-; R600: MOV
-; R600: MOV
-; R600: MOV
-; R600-NOT: MOV
-; SI: {{^}}build_vector4:
-; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
-; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
-; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
-; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
-; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}}
 define amdgpu_kernel void @build_vector4 (<4 x i32> addrspace(1)* %out) {
+; R600-LABEL: build_vector4:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, literal.x,
+; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; R600-NEXT: MOV * T0.Z, literal.x,
+; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
+; R600-NEXT: MOV * T0.Y, literal.x,
+; R600-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; R600-NEXT: MOV T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 5(7.006492e-45), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector4:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, 6
+; GFX6-NEXT: v_mov_b32_e32 v2, 7
+; GFX6-NEXT: v_mov_b32_e32 v3, 8
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector4:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, 6
+; GFX8-NEXT: v_mov_b32_e32 v2, 7
+; GFX8-NEXT: v_mov_b32_e32 v3, 8
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector4:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, 6
+; GFX10-NEXT: v_mov_b32_e32 v2, 7
+; GFX10-NEXT: v_mov_b32_e32 v3, 8
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
 entry:
   store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+define amdgpu_kernel void @build_vector_v2i16 (<2 x i16> addrspace(1)* %out) {
+; R600-LABEL: build_vector_v2i16:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV T4.X, literal.x,
+; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; R600-NEXT: 393221(5.510200e-40), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector_v2i16:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, 0x60005
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector_v2i16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x60005
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector_v2i16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x60005
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+entry:
+  store <2 x i16> <i16 5, i16 6>, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @build_vector_v2i16_trunc (<2 x i16> addrspace(1)* %out, i32 %a) {
+; R600-LABEL: build_vector_v2i16_trunc:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
+; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; R600-NEXT: OR_INT T4.X, PV.W, literal.x,
+; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; R600-NEXT: 327680(4.591775e-40), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector_v2i16_trunc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_lshr_b32 s2, s2, 16
+; GFX6-NEXT: s_or_b32 s4, s2, 0x50000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector_v2i16_trunc:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshr_b32 s0, s0, 16
+; GFX8-NEXT: s_or_b32 s0, s0, 0x50000
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector_v2i16_trunc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_lshr_b32 s2, s2, 16
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+  %srl = lshr i32 %a, 16
+  %trunc = trunc i32 %srl to i16
+  %ins.0 = insertelement <2 x i16> undef, i16 %trunc, i32 0
+  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
+  store <2 x i16> %ins.1, <2 x i16> addrspace(1)* %out
+  ret void
+}