AMDGPU: pack 16-bit halves of two VGPRs with a single V_PERM_B32.

Adds two GCNPat patterns (inside the HasVOP3PInsts block) that select
V_PERM_B32 for a divergent v2f16 build_vector whose elements are the low
(or the high) 16 bits of two 32-bit VGPRs, and wraps the existing
V_PACK_B32_F16 pattern in AddedComplexity = 5. The updated tests show the
v_and_b32 + v_lshl_or_b32 sequences being replaced by one v_perm_b32.

NOTE(review): this patch text was recovered from a whitespace-collapsed copy
in which the build_vector template arguments and the shufflevector masks had
been stripped. Those tokens, the indentation, and the line breaks of the
SIInstructions.td context lines are reconstructed; verify against the tree
(or apply with --ignore-whitespace) before relying on the hunk offsets.

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2797,12 +2797,31 @@
   (v2f16 (V_LSHL_OR_B32_e64 SReg_32:$src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0))))
 >;
 
+// Take the lower 16 bits from each VGPR_32 and concat them:
+//   result = ($b << 16) | ($a & 0xffff)
+// Selector 0x05040100 picks bytes 1:0 of src1 ($a), then bytes 1:0 of src0 ($b).
+def : GCNPat <
+  (i32 (bitconvert (v2f16 (DivergentBinFrag<build_vector> (f16 (bitconvert (i16 (trunc VGPR_32:$a)))), (f16 (bitconvert (i16 (trunc VGPR_32:$b)))))))),
+  (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
+>;
+
+// Take the upper 16 bits from each VGPR_32 and concat them:
+//   result = ($b & 0xffff0000) | ($a >> 16)
+// Selector 0x07060302 picks bytes 3:2 of src1 ($a), then bytes 3:2 of src0 ($b).
+def : GCNPat <
+  (i32 (bitconvert (v2f16 (DivergentBinFrag<build_vector> (f16 (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))), (f16 (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
+  (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302)))
+>;
 
+// NOTE(review): presumably AddedComplexity keeps V_PACK_B32_F16 preferred over
+// the V_PERM_B32 patterns above when both inputs are canonicalized -- confirm.
+let AddedComplexity = 5 in {
 def : GCNPat <
   (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
                                          (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
   (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
 >;
+}
 } // End SubtargetPredicate = HasVOP3PInsts
 
 // With multiple uses of the shift, this will duplicate the shift and
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
--- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
@@ -178,9 +178,8 @@
 ; GFX9-LABEL: undef_lo2_v4f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v[0:1]
 ; GFX9-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -389,9 +389,9 @@
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:  .LBB2_4: ; %exit
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
-; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
+; GFX9-NEXT:    v_perm_b32 v0, v3, v3, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3800
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3900
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3d00
@@ -955,9 +955,9 @@
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
 ; GFX9-NEXT:  .LBB5_4: ; %exit
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v5
-; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
+; GFX9-NEXT:    v_perm_b32 v0, v5, v5, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3800
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3900
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3d00
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -2199,11 +2199,11 @@
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x5040100
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_lshl_or_b32 v1, s6, 16, v1
+; GFX9-NEXT:    v_perm_b32 v1, s6, v1, v5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2257,9 +2257,7 @@
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[6:7]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2593,12 +2591,12 @@
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x5040100
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT:    v_lshl_or_b32 v1, s6, 16, v1
+; GFX9-NEXT:    v_perm_b32 v1, s6, v1, v9
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
 ; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
@@ -2668,9 +2666,7 @@
 ; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[6:7]
 ; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[6:7] offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -169,8 +169,8 @@
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v0
 ; GFX9-NEXT:    ;;#ASMEND
@@ -247,10 +247,10 @@
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT:    v_perm_b32 v0, v2, v1, s0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 9, v0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/v_perm_non_canon.ll b/llvm/test/CodeGen/AMDGPU/v_perm_non_canon.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/v_perm_non_canon.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+define amdgpu_kernel void @transpose_fp16(<2 x half>* %x0, <2 x half>* %x1, <2 x half>* %y0, <2 x half>* %y1) {
+; GCN-LABEL: transpose_fp16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    flat_load_dword v4, v[0:1]
+; GCN-NEXT:    flat_load_dword v5, v[2:3]
+; GCN-NEXT:    s_mov_b32 s0, 0x5040100
+; GCN-NEXT:    s_mov_b32 s1, 0x7060302
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_perm_b32 v6, v5, v4, s0
+; GCN-NEXT:    v_perm_b32 v4, v5, v4, s1
+; GCN-NEXT:    flat_store_dword v[0:1], v6
+; GCN-NEXT:    flat_store_dword v[2:3], v4
+; GCN-NEXT:    s_endpgm
+entry:
+  %0 = load <2 x half>, <2 x half>* %x0, align 4
+  %1 = load <2 x half>, <2 x half>* %x1, align 4
+  %vy0.2.vec.insert = shufflevector <2 x half> %0, <2 x half> %1, <2 x i32> <i32 0, i32 2>
+  %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 1, i32 undef>
+  %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 3>
+  store <2 x half> %vy0.2.vec.insert, <2 x half>* %y0, align 4
+  store <2 x half> %vy1.2.vec.insert, <2 x half>* %y1, align 4
+  ret void
+}