diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2610,12 +2610,25 @@
   (v2f16 (V_LSHL_OR_B32_e64 SReg_32:$src1, (i32 16),
     (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0))))
 >;
 
+// Take the lower 16 bits from each VGPR_32 and concat them
+def : GCNPat <
+  (i32 (bitconvert (v2f16 (DivergentBinFrag<build_vector> (f16 (bitconvert (i16 (trunc VGPR_32:$a)))), (f16 (bitconvert (i16 (trunc VGPR_32:$b)))))))),
+  (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
+>;
+
+// Take the upper 16 bits from each VGPR_32 and concat them
+def : GCNPat <
+  (i32 (bitconvert (v2f16 (DivergentBinFrag<build_vector> (f16 (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))), (f16 (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
+  (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302)))
+>;
+let AddedComplexity = 5 in {
 def : GCNPat <
   (v2f16 (is_canonicalized (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
                            (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
   (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
 >;
+}
 } // End SubtargetPredicate = HasVOP3PInsts
 
 def : GCNPat <
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
--- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
@@ -178,9 +178,8 @@
 ; GFX9-LABEL: undef_lo2_v4f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v[0:1]
 ; GFX9-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -401,9 +401,9 @@
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:  .LBB2_4: ; %exit
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
-; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; GFX9-NEXT:    v_perm_b32 v0, v3, v3, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3800
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3900
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3d00
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1758,10 +1758,9 @@
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX9-NEXT:    buffer_store_short v5, off, s[0:3], 0 offset:16
 ; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], 0 offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshl_or_b32 v1, v5, 16, v1
+; GFX9-NEXT:    v_perm_b32 v1, v5, v1, s4
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
 ; GFX9-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -1,14 +1,37 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}s_pack_v2f16:
-; GFX9: s_load_dword [[VAL0:s[0-9]+]]
-; GFX9: s_load_dword [[VAL1:s[0-9]+]]
-; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 {
+; GFX9-LABEL: s_pack_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GCN-LABEL: s_pack_v2f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; use s0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile i32, i32 addrspace(4)* %in0
   %val1 = load volatile i32, i32 addrspace(4)* %in1
   %lo.i = trunc i32 %val0 to i16
@@ -23,11 +46,31 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}s_pack_v2f16_imm_lo:
-; GFX9: s_load_dword [[VAL1:s[0-9]+]]
-; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1234, [[VAL1]]
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(4)* %in1) #0 {
+; GFX9-LABEL: s_pack_v2f16_imm_lo:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, 0x1234, s0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GCN-LABEL: s_pack_v2f16_imm_lo:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshl_b32 s0, s0, 16
+; GCN-NEXT:    s_or_b32 s0, s0, 0x1234
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; use s0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_endpgm
   %val1 = load i32, i32 addrspace(4)* %in1
   %hi.i = trunc i32 %val1 to i16
   %hi = bitcast i16 %hi.i to half
@@ -39,11 +82,31 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}s_pack_v2f16_imm_hi:
-; GFX9: s_load_dword [[VAL0:s[0-9]+]]
-; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1234
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(4)* %in0) #0 {
+; GFX9-LABEL: s_pack_v2f16_imm_hi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, 0x1234
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use s0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
+;
+; GCN-LABEL: s_pack_v2f16_imm_hi:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
+; GCN-NEXT:    s_or_b32 s0, s0, 0x12340000
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ; use s0
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_endpgm
   %val0 = load i32, i32 addrspace(4)* %in0
   %lo.i = trunc i32 %val0 to i16
   %lo = bitcast i16 %lo.i to half
@@ -55,14 +118,22 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2f16:
-; GFX9: global_load_dword [[VAL0:v[0-9]+]]
-; GFX9: global_load_dword [[VAL1:v[0-9]+]]
-
-; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]]
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+; GFX9-LABEL: v_pack_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -80,15 +151,24 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2f16_user:
-; GFX9: global_load_dword [[VAL0:v[0-9]+]]
-; GFX9: global_load_dword [[VAL1:v[0-9]+]]
-
-; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]]
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]
-
-; GFX9: v_add_u32_e32 v{{[0-9]+}}, 9, [[PACKED]]
 define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+; GFX9-LABEL: v_pack_v2f16_user:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
+; GFX9-NEXT:    v_add_u32_e32 v0, 9, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -107,13 +187,20 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2f16_imm_lo:
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]]
-
-; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234{{$}}
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 {
+; GFX9-LABEL: v_pack_v2f16_imm_lo:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1234
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
@@ -127,14 +214,20 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_lo:
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]]
-
-; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4400{{$}}
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
-
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
+; GFX9-LABEL: v_pack_v2f16_inline_imm_lo:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
@@ -148,15 +241,21 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2f16_imm_hi:
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]]
-
-; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
-; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]]
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]]
-
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 {
+; GFX9-LABEL: v_pack_v2f16_imm_hi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s0, 0x1234
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -170,15 +269,21 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2f16_inline_f16imm_hi:
-; GFX9-DAG: global_load_dword [[VAL:v[0-9]+]]
-
-; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3c00
-; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]]
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]]
-
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 {
+; GFX9-LABEL: v_pack_v2f16_inline_f16imm_hi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_movk_i32 s0, 0x3c00
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, s0, 16, v0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
@@ -192,14 +297,20 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_hi:
-; GFX9: global_load_dword [[VAL:v[0-9]+]]
-
-; GFX9: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]]
-; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], 64, 16, [[MASKED]]
-
-; GFX9: ; use [[PACKED]]
 define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
+; GFX9-LABEL: v_pack_v2f16_inline_imm_hi:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_lshl_or_b32 v0, 64, 16, v0
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/v_perm_non_canon.ll b/llvm/test/CodeGen/AMDGPU/v_perm_non_canon.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/v_perm_non_canon.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+define amdgpu_kernel void @transpose_fp16(<2 x half>* nocapture noundef nonnull readonly align 4 dereferenceable(4) %x0, <2 x half>* nocapture noundef nonnull readonly align 4 dereferenceable(4) %x1, <2 x half>* nocapture noundef nonnull writeonly align 4 dereferenceable(4) %y0, <2 x half>* nocapture noundef nonnull writeonly align 4 dereferenceable(4) %y1) {
+; GCN-LABEL: transpose_fp16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    flat_load_dword v4, v[0:1]
+; GCN-NEXT:    flat_load_dword v5, v[2:3]
+; GCN-NEXT:    s_mov_b32 s0, 0x5040100
+; GCN-NEXT:    s_mov_b32 s1, 0x7060302
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_perm_b32 v6, v5, v4, s0
+; GCN-NEXT:    v_perm_b32 v4, v5, v4, s1
+; GCN-NEXT:    flat_store_dword v[0:1], v6
+; GCN-NEXT:    flat_store_dword v[2:3], v4
+; GCN-NEXT:    s_endpgm
+entry:
+  %0 = load <2 x half>, <2 x half>* %x0, align 4
+  %1 = load <2 x half>, <2 x half>* %x1, align 4
+  %vy0.2.vec.insert = shufflevector <2 x half> %0, <2 x half> %1, <2 x i32> <i32 0, i32 2>
+  %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 1, i32 undef>
+  %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 3>
+  store <2 x half> %vy0.2.vec.insert, <2 x half>* %y0, align 4
+  store <2 x half> %vy1.2.vec.insert, <2 x half>* %y1, align 4
+  ret void
+}