diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2684,6 +2684,7 @@ // COPY is workaround tablegen bug from multiple outputs // from S_LSHL_B32's multiple outputs from implicit scc def. +let AddedComplexity = 1 in { def : GCNPat < (v2i16 (UniformBinFrag (i16 0), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i16 16)) @@ -2750,6 +2751,7 @@ (v2f16 (DivergentBinFrag (f16 undef), (f16 SReg_32:$src1))), (v2f16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1)) >; +} let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < @@ -2770,39 +2772,71 @@ >; def : GCNPat < - (v2i16 (DivergentBinFrag (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), - (v2i16 (V_BFI_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0, SReg_32:$src1)) + (v2i16 (UniformBinFrag (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), + (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) >; +def : GCNPat < + (v2f16 (UniformBinFrag (f16 SReg_32:$src0), (f16 SReg_32:$src1))), + (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) +>; +// Take the lower 16 bits from each VGPR_32 and concat them def : GCNPat < - (v2i16 (UniformBinFrag (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), - (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), - (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) + (v2f16 (DivergentBinFrag (f16 VGPR_32:$a), (f16 VGPR_32:$b))), + (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100))) >; +// Take the lower 16 bits from each VGPR_32 and concat them def : GCNPat < - (v2i16 (DivergentBinFrag (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), - (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), - (v2i16 (V_AND_OR_B32_e64 SReg_32:$src1, (i32 (V_MOV_B32_e32 (i32 0xffff0000))), (i32 (V_LSHRREV_B32_e64 (i32 16), SReg_32:$src0)))) + (v2i16 (DivergentBinFrag (i16 VGPR_32:$a), (i16 VGPR_32:$b))), + (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100))) >; +// Take the upper 16 bits from each VGPR_32 and concat them def : GCNPat < - (v2f16 (UniformBinFrag (f16 SReg_32:$src0), (f16 SReg_32:$src1))), - (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) + (v2f16 (DivergentBinFrag (f16 (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))), (f16 (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))), + (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302))) +>; + +// Take the upper 16 bits from V[0] and the lower 16 bits from V[1] +def : GCNPat < + (v2f16 (DivergentBinFrag (f16 VGPR_32:$a), (f16 (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))), + (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060100))) >; +// Take the upper 16 bits from V[0] and the lower 16 bits from V[1] def : GCNPat < - (v2f16 (DivergentBinFrag (f16 SReg_32:$src0), (f16 SReg_32:$src1))), - (v2f16 (V_LSHL_OR_B32_e64 SReg_32:$src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0)))) + (v2i16 (DivergentBinFrag (i16 VGPR_32:$a), (i16 (trunc (srl VGPR_32:$b, (i32 16)))))), + (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060100))) >; +// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] +def : GCNPat < + (v2f16 (DivergentBinFrag (f16 (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))), (f16 VGPR_32:$b))), + (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040302))) +>; +// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] +def : GCNPat < + (v2i16 (DivergentBinFrag (i16 (trunc (srl VGPR_32:$a, (i32 16)))), (i16 VGPR_32:$b))), + (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040302))) +>; + +// Take the upper 16 bits from each VGPR_32 and concat them +def : GCNPat < + (v2i16 (DivergentBinFrag (i16 (trunc (srl VGPR_32:$a, (i32 16)))), (i16 (trunc (srl VGPR_32:$b, (i32 16)))))), + (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302))) +>; + +let AddedComplexity = 5 in { def : GCNPat < (v2f16 (is_canonicalized (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)), (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))), (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1) >; +} } // End SubtargetPredicate = HasVOP3PInsts // With multiple uses of the shift, this will duplicate the shift and diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -620,18 +620,18 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0x5040302 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_add_u16 v3, v2, v3 +; GFX9-NEXT: v_perm_b32 v2, 0, v3, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -642,7 +642,6 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -651,10 +650,10 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: v_pk_add_u16 v2, v1, v2 +; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX10-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v2, 0, v0, 0x5040302 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll --- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll @@ -152,9 +152,8 @@ ; GFX9-LABEL: undef_lo2_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] ; GFX9-NEXT: ;;#ASMEND @@ -178,9 +177,8 @@ ; GFX9-LABEL: undef_lo2_v4f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] ; GFX9-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -474,6 +474,7 @@ ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -483,8 +484,7 @@ ; GFX900-NEXT: v_mov_b32_e32 v1, v0 ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v3 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX900-NEXT: s_endpgm @@ -544,9 +544,8 @@ ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1) ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) -; GFX10_DEFAULT-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10_DEFAULT-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8 -; GFX10_DEFAULT-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) ; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX10_DEFAULT-NEXT: s_endpgm @@ -687,16 +686,27 @@ ; The volatile operations aren't put on the same chain define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) { -; GCN-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_read_u16 v1, v0 offset:2 -; GCN-NEXT: ds_read_u16_d16_hi v0, v0 -; GCN-NEXT: v_mov_b32_e32 v2, 0xffff -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] -; GCN-NEXT: v_bfi_b32 v0, v2, v1, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ds_read_u16 v1, v0 offset:2 +; GFX900-NEXT: ds_read_u16_d16_hi v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060100 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: ds_read_u16 v1, v0 offset:2 +; FLATSCR-NEXT: ds_read_u16_d16_hi v0, v0 +; FLATSCR-NEXT: s_mov_b32 s0, 0x7060100 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_perm_b32 v0, v0, v1, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: ; GFX10: ; %bb.0: ; %bb @@ -706,7 +716,7 @@ ; GFX10-NEXT: ds_read_u16_d16_hi v0, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] -; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: @@ -718,7 +728,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060100 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 @@ -801,17 +811,29 @@ } define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) { -; GCN-LABEL: chain_hi_to_lo_global_other_dep: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off glc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, 0xffff -; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] -; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_global_other_dep: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060100 +; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_global_other_dep: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: global_load_short_d16_hi v0, v[0:1], off glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, 0x7060100 +; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_perm_b32 v0, v0, v2, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: chain_hi_to_lo_global_other_dep: ; GFX10: ; %bb.0: ; %bb @@ -822,7 +844,7 @@ ; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] -; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: chain_hi_to_lo_global_other_dep: @@ -835,7 +857,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060100 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1 @@ -849,18 +871,31 @@ } define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) { -; GCN-LABEL: chain_hi_to_lo_flat_other_dep: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] glc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, 0xffff -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] -; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_flat_other_dep: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1] glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060100 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_flat_other_dep: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: flat_load_short_d16_hi v0, v[0:1] glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, 0x7060100 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; FLATSCR-NEXT: v_perm_b32 v0, v0, v2, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: chain_hi_to_lo_flat_other_dep: ; GFX10: ; %bb.0: ; %bb @@ -873,7 +908,7 @@ ; GFX10-NEXT: flat_load_short_d16_hi v0, v[0:1] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] -; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: chain_hi_to_lo_flat_other_dep: @@ -886,7 +921,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060100 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1 @@ -900,17 +935,29 @@ } define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) { -; GCN-LABEL: chain_hi_to_lo_group_may_alias_store: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, 0x7b -; GCN-NEXT: ds_read_u16 v2, v0 -; GCN-NEXT: ds_write_b16 v1, v3 -; GCN-NEXT: ds_read_u16 v0, v0 offset:2 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_group_may_alias_store: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0x7b +; GFX900-NEXT: ds_read_u16 v2, v0 +; GFX900-NEXT: ds_write_b16 v1, v3 +; GFX900-NEXT: ds_read_u16 v0, v0 offset:2 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_group_may_alias_store: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v3, 0x7b +; FLATSCR-NEXT: ds_read_u16 v2, v0 +; FLATSCR-NEXT: ds_write_b16 v1, v3 +; FLATSCR-NEXT: ds_read_u16 v0, v0 offset:2 +; FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_perm_b32 v0, v2, v0, s0 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store: ; GFX10: ; %bb.0: ; %bb @@ -921,8 +968,7 @@ ; GFX10-NEXT: ds_write_b16 v1, v2 ; GFX10-NEXT: ds_read_u16 v0, v0 offset:2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: chain_hi_to_lo_group_may_alias_store: @@ -934,9 +980,7 @@ ; GFX11-NEXT: ds_store_b16 v1, v2 ; GFX11-NEXT: ds_load_u16 v0, v0 offset:2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -262,15 +262,15 @@ ; GFX9-LABEL: divergent_vec_i16_LL: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: divergent_vec_i16_LL: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x i16> undef, i16 %a, i32 0 %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1 @@ -333,15 +333,15 @@ ; GFX9-LABEL: divergent_vec_i16_LH: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: divergent_vec_i16_LH: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] %shift = lshr i32 %b, 16 %tr = trunc i32 %shift to i16 @@ -409,17 +409,15 @@ ; GFX9-LABEL: divergent_vec_i16_HH: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: divergent_vec_i16_HH: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff0000 -; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x7060302 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] %shift_a = lshr i32 %a, 16 %tr_a = trunc i32 %shift_a to i16 @@ -499,15 +497,15 @@ ; GFX9-LABEL: divergent_vec_f16_LL: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: divergent_vec_f16_LL: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x half> undef, half %a, i32 0 %vec = insertelement <2 x half> %tmp, half %b, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -112,15 +112,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -252,15 +251,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -389,9 +387,9 @@ ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB2_4: ; %exit +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v3, v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3900 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x3d00 @@ -582,15 +580,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -770,15 +767,14 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -955,9 +951,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB5_4: ; %exit +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v5, v5, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -184,10 +184,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7060100 ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-NEXT: v_perm_b32 v1, v0, 0, v1 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -197,7 +197,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -207,7 +207,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -319,10 +319,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7060100 ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-NEXT: v_perm_b32 v1, v0, 0, v1 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -332,7 +332,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -342,7 +342,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -235,10 +235,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7060100 ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-NEXT: v_perm_b32 v1, v0, 0, v1 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -246,10 +246,10 @@ ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off -; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0x7060100 ; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-FLASTSCR-NEXT: v_perm_b32 v1, v0, 0, v1 ; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -259,7 +259,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -269,7 +269,7 @@ ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-FLASTSCR-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -279,7 +279,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -290,7 +290,7 @@ ; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-FLASTSCR-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX11-FLASTSCR-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] @@ -414,10 +414,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7060100 ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-NEXT: v_perm_b32 v1, v0, 0, v1 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -425,10 +425,10 @@ ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off -; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0x7060100 ; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_bfi_b32 v1, v1, 0, v0 +; GFX9-FLASTSCR-NEXT: v_perm_b32 v1, v0, 0, v1 ; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -438,7 +438,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -448,7 +448,7 @@ ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX10-FLASTSCR-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; @@ -458,7 +458,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -469,7 +469,7 @@ ; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-FLASTSCR-NEXT: v_perm_b32 v1, v0, 0, 0x7060100 ; GFX11-FLASTSCR-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -129,30 +129,6 @@ } define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 { -; VI-LABEL: v_test_canonicalize_build_vector_v2f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_test_canonicalize_build_vector_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_test_canonicalize_build_vector_v2f16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: s_setpc_b64 s[30:31] %ins0 = insertelement <2 x half> undef, half %lo, i32 0 %ins1 = insertelement <2 x half> %ins0, half %hi, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1) @@ -2152,36 +2128,6 @@ } define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 { -; VI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; CI-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <4 x half> undef, half %val0, i32 0 %vec1 = insertelement <4 x half> %vec0, half %val1, i32 1 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec1) @@ -2189,40 +2135,6 @@ } define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half %val1, half %val2) #1 { -; VI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; CI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: -; CI: ; %bb.0: -; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; CI-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <4 x half> undef, half %val0, i32 0 %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2 %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -69,8 +69,8 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: @@ -146,8 +146,8 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: @@ -241,10 +241,9 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v1, v6, v1, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: @@ -368,14 +367,11 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-SAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v8, 16, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v2, v12, 16, v2 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v3, v14, 16, v3 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v8, v0, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v1, v10, v1, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v2, v12, v2, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v3, v14, v3, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -70,8 +70,8 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v2f16: @@ -147,8 +147,8 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v3f16: @@ -242,10 +242,9 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v1, v6, v1, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v4f16: @@ -369,14 +368,11 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-SAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v8, 16, v0 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v2, v12, 16, v2 -; GFX9-SAFE-NEXT: v_lshl_or_b32 v3, v14, 16, v3 +; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-SAFE-NEXT: v_perm_b32 v0, v8, v0, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v1, v10, v1, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v2, v12, v2, s4 +; GFX9-SAFE-NEXT: v_perm_b32 v3, v14, v3, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -933,8 +933,8 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v6, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fshr_v3i16: @@ -946,25 +946,24 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v6 -; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7 -; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_lshrrev_b16 v4, v6, v9 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v4 +; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7 +; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8 +; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 +; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 +; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v5 -; GFX10-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshlrev_b16 v1, v2, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX10-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX10-NEXT: v_lshlrev_b16 v1, v4, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -972,27 +971,26 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v8, -1, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX11-NEXT: v_xor_b32_e32 v10, -1, v6 -; GFX11-NEXT: v_lshlrev_b16 v7, 1, v7 -; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2 -; GFX11-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX11-NEXT: v_lshrrev_b16 v4, v6, v9 +; GFX11-NEXT: v_xor_b32_e32 v10, -1, v4 +; GFX11-NEXT: v_lshlrev_b16 v6, 1, v6 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v7 ; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX11-NEXT: v_lshlrev_b16 v6, v10, v7 +; GFX11-NEXT: v_lshrrev_b16 v7, v7, v8 +; GFX11-NEXT: v_lshlrev_b16 v0, v10, v0 +; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2 +; GFX11-NEXT: v_lshlrev_b16 v6, v9, v6 +; GFX11-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-NEXT: v_xor_b32_e32 v2, -1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b16 v1, v2, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v1, v4, v1 +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1080,12 +1078,11 @@ ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v9 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fshr_v4i16: @@ -1100,33 +1097,31 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v12, -1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 ; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10 -; GFX10-NEXT: v_xor_b32_e32 v14, -1, v9 -; GFX10-NEXT: v_lshlrev_b16 v1, v11, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v12, v0 +; GFX10-NEXT: v_lshlrev_b16 v9, 1, v9 +; GFX10-NEXT: v_xor_b32_e32 v12, -1, v10 +; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX10-NEXT: v_xor_b32_e32 v13, -1, v5 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v14, -1, v4 +; GFX10-NEXT: v_lshlrev_b16 v7, v7, v8 +; GFX10-NEXT: v_lshrrev_b16 v8, v10, v11 +; GFX10-NEXT: v_lshlrev_b16 v9, v12, v9 +; GFX10-NEXT: v_lshlrev_b16 v1, v13, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v14, v0 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 -; GFX10-NEXT: v_lshlrev_b16 v4, v7, v8 -; GFX10-NEXT: v_lshrrev_b16 v5, v9, v13 -; GFX10-NEXT: v_lshlrev_b16 v7, v14, v10 +; GFX10-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX10-NEXT: v_or_b32_e32 v5, v9, v8 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_or_b32_e32 v2, v4, v6 -; GFX10-NEXT: v_or_b32_e32 v3, v7, v5 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fshr_v4i16: @@ -1136,36 +1131,33 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX11-NEXT: v_xor_b32_e32 v11, -1, v5 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX11-NEXT: v_xor_b32_e32 v12, -1, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 ; GFX11-NEXT: v_lshlrev_b16 v8, 1, v8 ; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-NEXT: v_lshlrev_b16 v10, 1, v10 -; GFX11-NEXT: v_xor_b32_e32 v14, -1, v9 -; GFX11-NEXT: v_lshlrev_b16 v1, v11, v1 -; GFX11-NEXT: v_lshlrev_b16 v0, v12, v0 +; GFX11-NEXT: v_lshlrev_b16 v9, 1, v9 +; GFX11-NEXT: v_xor_b32_e32 v12, -1, v10 +; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v5 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v14, -1, v4 +; GFX11-NEXT: v_lshlrev_b16 v7, v7, v8 +; GFX11-NEXT: v_lshrrev_b16 v8, v10, v11 +; GFX11-NEXT: v_lshlrev_b16 v9, v12, v9 +; GFX11-NEXT: v_lshlrev_b16 v1, v13, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, v14, v0 ; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2 ; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3 -; GFX11-NEXT: v_lshlrev_b16 v4, v7, v8 -; GFX11-NEXT: v_lshrrev_b16 v5, v9, v13 -; GFX11-NEXT: v_lshlrev_b16 v7, v14, v10 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-NEXT: v_or_b32_e32 v2, v4, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v7, v5 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) ret <4 x i16> %ret diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -1032,29 +1032,29 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NODL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v7, 8, v1 -; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v8, 8, v2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 -; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v6, 8, v1 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 +; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s0 +; GFX9-NODL-NEXT: v_perm_b32 v1, v6, v1, s0 +; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 +; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 -; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v10, 8, v6 -; GFX9-NODL-NEXT: v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v10, 16, v6 -; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v9, 16, v4 +; GFX9-NODL-NEXT: v_perm_b32 v5, v9, v5, s0 +; GFX9-NODL-NEXT: v_perm_b32 v4, v8, v4, s0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 @@ -1069,29 +1069,29 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 8, v1 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 8, v2 -; GFX9-DL-NEXT: v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 8, v1 +; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 +; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v6, v1, s0 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 +; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 8, v6 -; GFX9-DL-NEXT: v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v6 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v9, 16, v4 +; GFX9-DL-NEXT: v_perm_b32 v5, v9, v5, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v8, v4, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 @@ -1106,7 +1106,6 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1114,22 +1113,22 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_ashrrev_i16 v5, 8, v1 +; GFX10-DL-NEXT: v_ashrrev_i16 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_ashrrev_i16 v6, 8, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_ashrrev_i16 v5, 8, v2 +; GFX10-DL-NEXT: v_bfe_i32 v6, v2, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v7, v1, 0, 8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8 -; GFX10-DL-NEXT: v_ashrrev_i16 v7, 8, v1 -; GFX10-DL-NEXT: v_ashrrev_i16 v8, 8, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX10-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 +; GFX10-DL-NEXT: v_ashrrev_i16 v6, 8, v1 +; GFX10-DL-NEXT: v_ashrrev_i16 v7, 8, v2 +; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_perm_b32 v2, v7, v2, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1913,35 +1913,33 @@ ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NODL-NEXT: s_mov_b32 s1, 0x5040100 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v5, 8, v1 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v7, 8, v2 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; GFX9-NODL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 -; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v5, 16, v1 -; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NODL-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xff, v2 +; GFX9-NODL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s1 +; GFX9-NODL-NEXT: v_perm_b32 v1, v5, v1, s1 +; GFX9-NODL-NEXT: v_perm_b32 v5, v6, v9, s1 +; GFX9-NODL-NEXT: v_perm_b32 v4, v4, v8, s1 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v8, 16, v4 -; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v6, 16, v9 +; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 -; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v5, v4 -; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2 -; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u16_e32 v3, v2, v3 +; GFX9-NODL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; @@ -1951,35 +1949,33 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff -; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-DL-NEXT: s_mov_b32 s1, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v5, 8, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v7, 8, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 24, v2 -; GFX9-DL-NEXT: v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v5, 16, v1 -; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v10 -; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v6, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xff, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s1 +; GFX9-DL-NEXT: v_perm_b32 v1, v5, v1, s1 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v9, s1 +; GFX9-DL-NEXT: v_perm_b32 v4, v4, v8, s1 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v9 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v5, v4 -; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 -; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v2, v3 +; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 +; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -1988,8 +1984,7 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-DL-NEXT: v_mov_b32_e32 v5, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1997,22 +1992,20 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b16 v7, 8, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v9 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v6 +; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xff, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v1 +; GFX10-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_perm_b32 v2, v7, v2, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2345,68 +2345,69 @@ ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 4, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 4, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 20, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 12, v5 -; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v6 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v8 -; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v9 -; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v10 -; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v12 -; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v13 -; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 4, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 20, v2 +; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v14 -; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v15 -; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v16 -; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v17 -; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v18 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 -; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 12, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v14, 12, v14 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 12, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v2 +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s0 +; GFX9-NEXT: v_perm_b32 v8, v13, v12, s0 +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s0 +; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 +; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v8 +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v17, v16, s0 +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s0 +; GFX9-NEXT: v_perm_b32 v10, v15, v14, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v9, v4 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v7, v10 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 @@ -2430,68 +2431,69 @@ ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v1 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 4, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 8, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 20, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2 -; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v4, 12, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v6 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v7 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v8 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v10 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v11 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v12 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v13 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 4, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v1 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v9, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v1 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v11, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 20, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v14 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v15 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v16 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v17 -; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v18 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v10, 12, v10 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v1 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v14, 12, v14 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v15 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v17, 12, v17 +; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s0 +; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s0 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s0 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v8 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v17, v16, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v10, v9, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v15, v14, s0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_add_u16_e32 v3, v5, v3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v9, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v7, v10 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 @@ -2517,79 +2519,71 @@ ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v2 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v13 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v8 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v13 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v14 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v16 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v16 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v5 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v17 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v6 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v17 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v12 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 +; GFX10-DL-XNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2618,79 +2612,71 @@ ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v13 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v14 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v8, v12, v8, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v6, v8 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v4, v4, v11, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v6 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v17 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v5, v4 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 +; GFX10-DL-NOXNACK-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2213,51 +2213,44 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v11, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 16, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 28, v2 -; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2279,51 +2272,44 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 16, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2353,49 +2339,41 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v9 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 +; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2930,51 +2908,44 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v11, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 16, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 28, v2 -; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -2997,51 +2968,44 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 +; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 +; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 +; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 16, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14 +; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 +; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1 -; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 +; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 @@ -3072,49 +3036,41 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 ; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v9 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 +; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -662,12 +662,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_movk_i32 s2, 0x3e7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v2, s2, v1 +; GFX9-NEXT: v_perm_b32 v1, v1, s2, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -715,7 +715,7 @@ ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_movk_i32 s2, 0x3e7 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1 +; GFX11-NEXT: v_perm_b32 v1, v1, s2, 0x7060100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -735,12 +735,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX9-NEXT: v_perm_b32 v1, v1, s6, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -790,10 +789,8 @@ ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NEXT: v_lshrrev_b32_e64 v2, 16, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v1, v2 +; GFX11-NEXT: v_perm_b32 v1, v1, s0, 0x7060302 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -814,11 +811,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v2, 53, v1 +; GFX9-NEXT: v_perm_b32 v1, v1, 53, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -865,7 +862,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1 +; GFX11-NEXT: v_perm_b32 v1, v1, 53, 0x7060100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -885,12 +882,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_movk_i32 s2, 0x3e7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, s2, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -936,10 +933,9 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_movk_i32 s2, 0x3e7 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, 0x3e7, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -958,11 +954,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, -15, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, -15, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1009,9 +1005,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, -15, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, -15, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1030,12 +1024,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_movk_i32 s2, 0x4500 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_perm_b32 v1, v1, s2, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1081,10 +1075,9 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_movk_i32 s2, 0x4500 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, 0x4500 +; GFX11-NEXT: v_perm_b32 v1, v1, s2, 0x7060100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1103,11 +1096,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, 53 +; GFX9-NEXT: v_perm_b32 v1, v1, 53, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1154,9 +1147,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, 53 +; GFX11-NEXT: v_perm_b32 v1, v1, 53, 0x7060100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1175,12 +1166,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_movk_i32 s2, 0x4500 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, s2, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1226,10 +1217,9 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_movk_i32 s2, 0x4500 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, 0x4500, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1248,11 +1238,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, 35, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, 35, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1299,9 +1289,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, 35, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, 35, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1592,11 +1580,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7060100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, v3, s6, v0 +; GFX9-NEXT: v_perm_b32 v0, v0, s6, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1648,7 +1636,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s0, v0 +; GFX11-NEXT: v_perm_b32 v0, v0, s0, 0x7060100 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1670,11 +1658,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, s6, v0, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1726,9 +1714,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1750,11 +1736,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7060100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 +; GFX9-NEXT: v_perm_b32 v1, v1, s6, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1806,7 +1792,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 +; GFX11-NEXT: v_perm_b32 v1, v1, s0, 0x7060100 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1828,11 +1814,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1884,9 +1870,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, s0, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1908,11 +1892,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7060100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 +; GFX9-NEXT: v_perm_b32 v1, v1, s6, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -1964,7 +1948,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 +; GFX11-NEXT: v_perm_b32 v1, v1, s0, 0x7060100 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2199,11 +2183,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -2257,9 +2241,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, s0, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2281,11 +2263,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7060100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v3, v5, s6, v3 +; GFX9-NEXT: v_perm_b32 v3, v3, s6, v5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -2338,7 +2320,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s0, v3 +; GFX11-NEXT: v_perm_b32 v3, v3, s0, 0x7060100 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2362,44 +2344,41 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc +; GFX9-NEXT: s_cmp_eq_u32 s7, 2 +; GFX9-NEXT: v_perm_b32 v3, v3, v6, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 2 -; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v3 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 +; GFX9-NEXT: v_perm_b32 v2, v6, v2, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 0 -; GFX9-NEXT: v_lshl_or_b32 v2, v7, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s2 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s2 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -2534,44 +2513,40 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] +; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 7 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: s_cmp_eq_u32 s1, 4 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 5 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 4 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, s2 +; GFX11-NEXT: s_cmp_eq_u32 s1, 2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 3 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s3 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 1 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s3 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s0, s6 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v3, v5, 16, v3 -; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2 -; GFX11-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s0, s1 +; GFX11-NEXT: v_perm_b32 v3, v3, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshl_or_b32 v0, v8, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2593,12 +2568,12 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, s6, v1, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -2668,9 +2643,7 @@ ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, s0, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 @@ -2695,12 +2668,12 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v9, 0x7060100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfi_b32 v3, v9, s6, v3 +; GFX9-NEXT: v_perm_b32 v3, v3, s6, v9 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -2770,7 +2743,7 @@ ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s0, v3 +; GFX11-NEXT: v_perm_b32 v3, v3, s0, 0x7060100 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 @@ -2798,81 +2771,74 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 -; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 ; GFX9-NEXT: v_mov_b32_e32 v9, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v9, vcc +; GFX9-NEXT: s_cmp_eq_u32 s7, 2 +; GFX9-NEXT: v_perm_b32 v3, v3, v10, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GFX9-NEXT: s_cmp_eq_u32 s7, 3 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 2 -; GFX9-NEXT: v_lshl_or_b32 v2, v11, 16, v2 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 14 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 15 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 14 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v9, vcc +; GFX9-NEXT: s_cmp_eq_u32 s7, 12 +; GFX9-NEXT: v_perm_b32 v0, v12, v0, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 13 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 12 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v12, 16, v0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 10 +; GFX9-NEXT: v_perm_b32 v7, v12, v7, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 11 -; GFX9-NEXT: v_lshl_or_b32 v3, v10, 16, v3 +; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 10 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: s_cmp_eq_u32 s7, 8 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 9 -; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, v11, v1, s2 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s7, 8 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v7, v13, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v6, v12, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v5, v10, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v4, v11, 16, v4 +; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX9-NEXT: v_perm_b32 v6, v12, v6, s2 +; GFX9-NEXT: v_perm_b32 v5, v10, v5, s2 +; GFX9-NEXT: v_perm_b32 v4, v9, v4, s2 ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm @@ -3113,80 +3079,72 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16 +; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 7 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: s_cmp_eq_u32 s1, 4 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 5 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s3 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s3 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 4 +; GFX11-NEXT: s_cmp_eq_u32 s1, 2 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 3 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 2 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 14 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v7 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s0, s3 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshl_or_b32 v3, v9, 16, v3 +; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 15 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 14 -; GFX11-NEXT: v_lshl_or_b32 v2, v10, 16, v2 +; GFX11-NEXT: s_cmp_eq_u32 s1, 12 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX11-NEXT: v_perm_b32 v2, v10, v2, 0x5040100 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v13, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 13 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 12 +; GFX11-NEXT: s_cmp_eq_u32 s1, 10 ; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 11 ; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, s0, s2 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 10 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s3 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s1, 9 -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX11-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 11 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s1, 8 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, s3 -; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s0, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v15, s0, s2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s0, s6 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v7, v10, 16, v7 -; GFX11-NEXT: v_lshl_or_b32 v6, v12, 16, v6 -; GFX11-NEXT: v_lshl_or_b32 v5, v13, 16, v5 -; GFX11-NEXT: v_lshl_or_b32 v4, v14, 16, v4 -; GFX11-NEXT: v_lshl_or_b32 v1, v11, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 9 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s0, s2 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: v_perm_b32 v7, v10, v7, 0x5040100 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s0, s1 +; GFX11-NEXT: v_perm_b32 v6, v12, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v14, v4, 0x5040100 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -8,8 +8,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -19,8 +19,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -35,8 +34,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -46,8 +45,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -62,8 +60,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -73,8 +71,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -89,8 +86,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -100,8 +97,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -116,8 +112,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -127,8 +123,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -143,10 +138,10 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -156,8 +151,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -172,8 +166,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -183,8 +177,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -199,8 +192,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -210,8 +203,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -226,10 +218,10 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_b_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -239,8 +231,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -255,11 +246,11 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-NEXT: v_lshl_or_b32 v6, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_perm_b32 v6, v3, v2, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[4:7], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -269,8 +260,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -283,16 +273,15 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { ; GFX9-LABEL: gather4_l_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 ; GFX9-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_l_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -304,18 +293,17 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { ; GFX9-LABEL: gather4_c_l_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s12 ; GFX9-NEXT: image_gather4_c_l v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_l_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -327,16 +315,15 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX9-LABEL: gather4_lz_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12 ; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_lz_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -348,16 +335,15 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { ; GFX9-LABEL: gather4_c_lz_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 ; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_lz_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -31,8 +31,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -42,8 +42,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -58,8 +57,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -69,8 +68,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -85,8 +83,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -96,8 +94,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -112,8 +109,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -123,8 +120,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -139,8 +135,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -150,8 +146,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -189,8 +184,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -200,8 +195,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -216,8 +210,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -227,8 +221,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -243,8 +236,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -254,8 +247,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -270,8 +262,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -281,8 +273,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -297,10 +288,10 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -310,8 +301,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -349,8 +339,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -360,8 +350,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -399,8 +388,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -410,8 +399,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -426,8 +414,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -437,8 +425,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -453,10 +440,10 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -466,8 +453,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -482,8 +468,8 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -493,8 +479,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -509,11 +494,11 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: s_mov_b64 s[12:13], exec ; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-NEXT: v_lshl_or_b32 v6, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_perm_b32 v6, v3, v2, s14 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_b_cl v[0:3], v[4:7], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -523,8 +508,7 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_mov_b32 s12, exec_lo ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -554,24 +538,19 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_d_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v3, v3, v2, s12 +; GFX9-NEXT: v_perm_b32 v2, v1, v0, s12 ; GFX9-NEXT: image_sample_d v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -583,15 +562,13 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r) { ; GFX9-LABEL: sample_d_3d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v11, v7, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_perm_b32 v11, v7, v6, s12 +; GFX9-NEXT: v_perm_b32 v9, v4, v3, s12 +; GFX9-NEXT: v_perm_b32 v7, v1, v0, s12 ; GFX9-NEXT: image_sample_d v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -599,14 +576,11 @@ ; GFX10-LABEL: sample_d_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v12, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v6 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, v5 -; GFX10-NEXT: v_lshl_or_b32 v11, v7, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v9, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v7, v1, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v2 +; GFX10-NEXT: v_perm_b32 v11, v7, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v9, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -637,24 +611,19 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v3, v6, v5, s12 +; GFX9-NEXT: v_perm_b32 v2, v4, v7, s12 +; GFX9-NEXT: v_perm_b32 v1, v8, v1, s12 ; GFX9-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -666,16 +635,15 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { ; GFX9-LABEL: sample_d_cl_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s12 ; GFX9-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -687,24 +655,19 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_d_cl_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v5, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v4, v3, v2, s12 +; GFX9-NEXT: v_perm_b32 v3, v1, v0, s12 ; GFX9-NEXT: image_sample_d_cl v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -716,16 +679,15 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { ; GFX9-LABEL: sample_c_d_cl_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s12 ; GFX9-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -737,26 +699,21 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_c_d_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v10, v6, v5, s12 +; GFX9-NEXT: v_perm_b32 v9, v4, v3, s12 +; GFX9-NEXT: v_perm_b32 v8, v2, v1, s12 ; GFX9-NEXT: image_sample_c_d_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -768,16 +725,15 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %lod) { ; GFX9-LABEL: sample_l_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12 ; GFX9-NEXT: image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -789,16 +745,15 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) { ; GFX9-LABEL: sample_l_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12 ; GFX9-NEXT: image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -810,16 +765,15 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %lod) { ; GFX9-LABEL: sample_c_l_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 ; GFX9-NEXT: image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -831,18 +785,17 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) { ; GFX9-LABEL: sample_c_l_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v5, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v4, v2, v1, s12 ; GFX9-NEXT: image_sample_c_l v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -871,16 +824,15 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX9-LABEL: sample_lz_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12 ; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_lz_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -909,16 +861,15 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) { ; GFX9-LABEL: sample_c_lz_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12 ; GFX9-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_lz_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -930,15 +881,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { ; GFX9-LABEL: sample_c_d_o_2darray_V1: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v13, v8 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_perm_b32 v12, v7, v6, s12 +; GFX9-NEXT: v_perm_b32 v11, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v10, v3, v2, s12 ; GFX9-NEXT: image_sample_c_d_o v0, v[8:13], s[0:7], s[8:11] dmask:0x4 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -948,12 +897,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v13, v8 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v12, v7, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v10, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[8:13], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -965,15 +911,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice) { ; GFX9-LABEL: sample_c_d_o_2darray_V2: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v13, v8 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_perm_b32 v12, v7, v6, s12 +; GFX9-NEXT: v_perm_b32 v11, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v10, v3, v2, s12 ; GFX9-NEXT: image_sample_c_d_o v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog @@ -983,12 +927,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v13, v8 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v12, v7, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v10, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.a16.dim.ll @@ -22,24 +22,19 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_cd_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v3, v3, v2, s12 +; GFX9-NEXT: v_perm_b32 v2, v1, v0, s12 ; GFX9-NEXT: image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -70,24 +65,19 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v3, v6, v5, s12 +; GFX9-NEXT: v_perm_b32 v2, v4, v7, s12 +; GFX9-NEXT: v_perm_b32 v1, v8, v1, s12 ; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -99,16 +89,15 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, half %s, half %clamp) { ; GFX9-LABEL: sample_cd_cl_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s12 ; GFX9-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -120,24 +109,19 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_cd_cl_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v5, v5, v4, s12 +; GFX9-NEXT: v_perm_b32 v4, v3, v2, s12 +; GFX9-NEXT: v_perm_b32 v3, v1, v0, s12 ; GFX9-NEXT: image_sample_cd_cl v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -149,16 +133,15 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp) { ; GFX9-LABEL: sample_c_cd_cl_1d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s12 ; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -170,26 +153,21 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_c_cd_cl_2d: ; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_mov_b32 s12, 0x5040100 ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 -; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v10, v6, v5, s12 +; GFX9-NEXT: v_perm_b32 v9, v4, v3, s12 +; GFX9-NEXT: v_perm_b32 v8, v2, v1, s12 ; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.encode.ll @@ -15,10 +15,8 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd7,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd7,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -41,10 +39,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd7,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd7,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -67,10 +63,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd7,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd7,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -95,10 +89,8 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd7,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd7,0x08,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.cd.g16.ll @@ -15,10 +15,8 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -41,10 +39,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -67,10 +63,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -95,10 +89,8 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -26,8 +26,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 ; GFX10-NEXT: image_sample_d v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -48,14 +47,13 @@ ; GFX10-LABEL: sample_d_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v15, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX10-NEXT: v_mov_b32_e32 v13, v5 ; GFX10-NEXT: v_mov_b32_e32 v12, v4 ; GFX10-NEXT: v_mov_b32_e32 v11, v3 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_lshl_or_b32 v14, v7, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_perm_b32 v14, v7, v6, 0x5040100 ; GFX10-NEXT: image_sample_d v[0:3], v[8:15], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -96,8 +94,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 ; GFX10-NEXT: image_sample_c_d v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -117,8 +114,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, half %s, half %clamp) { ; GFX10-LABEL: sample_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -139,12 +135,11 @@ ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-NEXT: v_lshl_or_b32 v10, v5, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_perm_b32 v10, v5, v4, 0x5040100 ; GFX10-NEXT: image_sample_d_cl v[0:3], v[6:11], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -166,8 +161,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, half %s, half %clamp) { ; GFX10-LABEL: sample_c_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -188,13 +182,12 @@ ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v13, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v8, v1 -; GFX10-NEXT: v_lshl_or_b32 v12, v6, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v0 +; GFX10-NEXT: v_perm_b32 v12, v6, v5, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[7:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -235,8 +228,7 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 ; GFX10-NEXT: image_sample_cd v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -275,8 +267,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, half %s, half %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 ; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -296,8 +287,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, half %s, half %clamp) { ; GFX10-LABEL: sample_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -318,12 +308,11 @@ ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v11, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v7, v1 -; GFX10-NEXT: v_lshl_or_b32 v10, v5, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_perm_b32 v10, v5, v4, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl v[0:3], v[6:11], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -345,8 +334,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, half %s, half %clamp) { ; GFX10-LABEL: sample_c_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -367,13 +355,12 @@ ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v13, v7 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v5 ; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v8, v1 -; GFX10-NEXT: v_lshl_or_b32 v12, v6, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v0 +; GFX10-NEXT: v_perm_b32 v12, v6, v5, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[7:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -396,14 +383,13 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v15, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX10-NEXT: v_mov_b32_e32 v13, v5 ; GFX10-NEXT: v_mov_b32_e32 v12, v4 ; GFX10-NEXT: v_mov_b32_e32 v11, v3 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_lshl_or_b32 v14, v7, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_perm_b32 v14, v7, v6, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o v0, v[8:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -426,14 +412,13 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v15, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GFX10-NEXT: v_mov_b32_e32 v13, v5 ; GFX10-NEXT: v_mov_b32_e32 v12, v4 ; GFX10-NEXT: v_mov_b32_e32 v11, v3 ; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_lshl_or_b32 v14, v7, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_perm_b32 v14, v7, v6, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o v[0:1], v[8:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -497,10 +482,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -524,10 +507,8 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -574,10 +555,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -619,10 +598,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_g16_noa16_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -666,10 +643,8 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -714,10 +689,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -759,10 +732,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -804,10 +775,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_g16_noa16_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -851,10 +820,8 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -879,14 +846,12 @@ define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -913,14 +878,12 @@ define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -22,21 +22,16 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd7,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd7,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_d_2d: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04] -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] +; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -50,10 +45,8 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd7,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd7,0x04,0x13,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x11,0x0f,0x88,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -62,12 +55,9 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x93,0x00,0x87,0xbf] -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x09,0x04] +; GFX11-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] +; GFX11-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x04,0x13,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x0f,0xe4,0xf0,0x02,0x00,0x00,0x08] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -96,21 +86,16 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd7,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd7,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_c_d_2d: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x56,0xd6,0x04,0x21,0x0d,0x04] -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x56,0xd6,0x02,0x21,0x05,0x04] +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -139,21 +124,16 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd7,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd7,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_d_cl_2d: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04] -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04] +; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x06] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -184,10 +164,8 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd7,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd7,0x08,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -196,11 +174,9 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x01,0x04] -; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x56,0xd6,0x08,0x21,0x05,0x04] +; GFX11-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf] +; GFX11-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x08,0x03,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x0f,0x50,0xf1,0x02,0x00,0x00,0x08] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -212,29 +188,25 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; encoding: [0x02,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04] -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd7,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd7,0x09,0x15,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x04,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_c_d_o_2darray_V1: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX11-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v10, v2 ; encoding: [0x02,0x03,0x14,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04] -; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04] +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; encoding: [0x04,0x00,0x87,0xbf] +; GFX11-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x09,0x15,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x04,0xf0,0xf0,0x02,0x00,0x00,0x08] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog @@ -246,29 +218,25 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; encoding: [0x02,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04] -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04] +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd7,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd7,0x09,0x15,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x06,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: sample_c_d_o_2darray_V2: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX11-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v10, v2 ; encoding: [0x02,0x03,0x14,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] ; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf] -; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04] -; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04] +; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; encoding: [0x04,0x00,0x87,0xbf] +; GFX11-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x09,0x15,0xfe,0x03,0x00,0x01,0x04,0x05] ; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x06,0xf0,0xf0,0x02,0x00,0x00,0x08] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -16,10 +16,8 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -33,10 +31,8 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -59,10 +55,8 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -85,10 +79,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -113,10 +105,8 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 +; GFX10-NEXT: v_perm_b32 v4, v4, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -128,14 +118,12 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -147,14 +135,12 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice) { ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v10, v3 +; GFX10-NEXT: v_mov_b32_e32 v9, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, v2 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 +; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v9, v10, 0x5040100 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -24,12 +24,12 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v1, v0 ; GFX906-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: v_mov_b32_e32 v2, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: ds_write_b16 v2, v1 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; @@ -77,12 +77,12 @@ ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v1, v0 offset:16 ; GFX900-NEXT: ds_read_u16 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: ds_write_b16 v2, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -91,12 +91,12 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v1, v0 offset:16 ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: v_mov_b32_e32 v2, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: ds_write_b16 v2, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; @@ -120,12 +120,12 @@ ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: ds_read_u16 v1, v0 offset:16 ; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v1 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1) -; GFX900-FLATSCR-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-FLATSCR-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: @@ -144,12 +144,12 @@ ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v3, v0 ; GFX900-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: ds_write_b16 v1, v3 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: ds_write_b16 v2, v0 -; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -158,12 +158,12 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v3, v0 ; GFX906-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: ds_write_b16 v1, v3 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: ds_write_b16 v2, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; @@ -187,12 +187,12 @@ ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: ds_read_u16 v3, v0 ; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-FLATSCR-NEXT: ds_write_b16 v1, v3 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v0 -; GFX900-FLATSCR-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX900-FLATSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: @@ -256,9 +256,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_hi_v2i16_reglo: @@ -299,9 +299,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -430,9 +430,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -479,9 +479,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u8 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -529,9 +529,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_i8 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -579,9 +579,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u8 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -631,9 +631,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_i8 v0, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -683,9 +683,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -734,9 +734,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -785,9 +785,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -837,9 +837,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -889,9 +889,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -942,9 +942,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -995,9 +995,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ushort v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1043,9 +1043,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ushort v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1091,9 +1091,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ubyte v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1140,9 +1140,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_sbyte v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1189,9 +1189,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ubyte v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1239,9 +1239,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_sbyte v0, v[0:1] -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1289,9 +1289,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1338,9 +1338,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1388,8 +1388,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1437,8 +1437,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1485,9 +1485,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1535,9 +1535,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1586,9 +1586,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1637,9 +1637,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1688,8 +1688,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1738,8 +1738,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1788,8 +1788,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1838,9 +1838,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1889,9 +1889,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1940,9 +1940,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1993,9 +1993,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2055,9 +2055,9 @@ ; GFX906-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2119,9 +2119,9 @@ ; GFX906-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4059 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2184,9 +2184,9 @@ ; GFX906-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4059 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2247,10 +2247,9 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v1, v0 ; GFX906-NEXT: ds_read_u16 v0, v0 offset:2 -; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_v2i16_split_multi_chain: @@ -2298,10 +2297,9 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v1, v0 ; GFX906-NEXT: ds_read_u16 v0, v0 offset:16 -; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_hi_v2i16_samechain: @@ -2340,18 +2338,18 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_v2i16_broadcast: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_v2i16_broadcast: @@ -2368,9 +2366,9 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 ; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-FLATSCR-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX900-FLATSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v0, v0, s0 ; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1 @@ -2400,10 +2398,9 @@ ; GFX906-NEXT: ds_read_u16 v2, v0 ; GFX906-NEXT: ds_write_b16 v1, v3 ; GFX906-NEXT: ds_read_u16 v0, v0 offset:16 -; GFX906-NEXT: s_waitcnt lgkmcnt(2) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_hi_v2i16_side_effect: @@ -2459,8 +2456,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_load_ushort v3, v[0:1], off offset:2 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX906-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v3, v2, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_global_v2i16_split: @@ -2512,9 +2509,10 @@ ; GFX906-NEXT: flat_load_ushort v2, v[0:1] glc ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: flat_load_ushort v3, v[0:1] offset:2 glc -; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX906-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: v_perm_b32 v0, v3, v2, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_flat_v2i16_split: @@ -2565,10 +2563,9 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v2, v[0:1], off glc ; GFX906-NEXT: global_load_ushort v3, v[0:1], off offset:2 glc -; GFX906-NEXT: s_waitcnt vmcnt(1) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v3, v2, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_constant_v2i16_split: @@ -2620,8 +2617,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_v2i16_split: @@ -2671,10 +2668,10 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v2, v1 -; GFX906-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: ds_write_b16 v1, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX906-NEXT: v_perm_b32 v2, v2, v0, s4 ; GFX906-NEXT: v_mov_b32_e32 v0, v2 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2711,3 +2708,4 @@ } attributes #0 = { nounwind } + diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -33,22 +33,22 @@ } define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reglo: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u16 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reglo: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2i16_reglo: @@ -60,6 +60,15 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 @@ -69,24 +78,24 @@ ; Show that we get reasonable regalloc without physreg constraints. define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u16 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -102,6 +111,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1 @@ -156,9 +176,9 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: s_movk_i32 s4, 0x4000 +; GFX906-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, s4, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, s4, v0, v1 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2f16_fpimm: @@ -189,10 +209,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -217,24 +236,24 @@ } define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 { -; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u16 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -250,6 +269,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2f16_reglo_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load half, half addrspace(3)* %in %build0 = insertelement <2 x half> undef, half %reg, i32 1 @@ -272,9 +302,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u8 v0, v0 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -300,24 +330,24 @@ } define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u8 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u8 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u8 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -333,6 +363,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u8 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in %ext = zext i8 %load to i16 @@ -356,9 +397,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_i8 v0, v0 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -384,24 +425,24 @@ } define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_i8 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_i8 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_i8 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -417,6 +458,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_i8 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in %ext = sext i8 %load to i16 @@ -427,24 +479,24 @@ } define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 { -; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u8 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u8 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u8 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -460,6 +512,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u8 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in %ext = zext i8 %load to i16 @@ -471,24 +534,24 @@ } define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 { -; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_i8 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_i8 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_i8 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -504,6 +567,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_i8 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in %ext = sext i8 %load to i16 @@ -515,28 +589,28 @@ } define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, <2 x i16> %reg) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: ds_write_b16 v2, v0 -; GFX900-NEXT: v_bfi_b32 v0, v3, v0, v1 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u16 v0, v0 +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x7060100 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_write_b16 v2, v0 +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 ; GFX906-NEXT: v_mov_b32_e32 v2, 0 -; GFX906-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: ds_write_b16 v2, v0 -; GFX906-NEXT: v_bfi_b32 v0, v3, v0, v1 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -554,6 +628,19 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x7060100 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v0 +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in %elt1 = extractelement <2 x i16> %reg, i32 1 @@ -580,12 +667,12 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX906-NEXT: v_mov_b32_e32 v3, 0 ; GFX906-NEXT: ds_write_b16 v3, v2 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(1) -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -614,30 +701,30 @@ } define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noalias %in, <2 x i16> %reg, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 { -; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: ds_write_b16 v2, v0 -; GFX900-NEXT: ds_write_b16 v3, v4 -; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_read_u16 v0, v0 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x7060100 +; GFX900-MUBUF-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX900-MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-MUBUF-NEXT: ds_write_b16 v2, v0 +; GFX900-MUBUF-NEXT: ds_write_b16 v3, v4 +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: ds_write_b16 v2, v0 ; GFX906-NEXT: ds_write_b16 v3, v4 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -656,6 +743,20 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x7060100 +; GFX900-FLATSCR-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-FLATSCR-NEXT: ds_write_b16 v2, v0 +; GFX900-FLATSCR-NEXT: ds_write_b16 v3, v4 +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in %elt1 = extractelement <2 x i16> %reg, i32 1 @@ -680,9 +781,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -722,10 +823,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -765,9 +865,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -808,9 +908,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -851,10 +951,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -896,10 +995,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -941,9 +1039,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ushort v0, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -980,10 +1078,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ushort v0, v[0:1] -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1022,9 +1119,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ubyte v0, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1062,9 +1159,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_sbyte v0, v[0:1] -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1102,10 +1199,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_ubyte v0, v[0:1] -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1144,10 +1240,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: flat_load_sbyte v0, v[0:1] -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1186,9 +1281,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1226,9 +1321,9 @@ ; GFX900-MUBUF: ; %bb.0: ; %entry ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-MUBUF-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) -; GFX900-MUBUF-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX900-MUBUF-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-MUBUF-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off ; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -1237,9 +1332,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 +; GFX906-NEXT: s_mov_b32 s4, 0x5040100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1259,9 +1354,9 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: scratch_load_ushort v1, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX900-FLATSCR-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX900-FLATSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-FLATSCR-NEXT: v_perm_b32 v0, v0, v1, s0 ; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -1288,10 +1383,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1339,8 +1433,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1388,8 +1482,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1437,9 +1531,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1486,9 +1579,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1536,9 +1629,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1587,8 +1680,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1637,8 +1730,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1687,9 +1780,8 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1738,9 +1830,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1780,10 +1872,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ushort v0, v[0:1], off offset:-4094 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1823,10 +1914,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1868,10 +1958,9 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: global_load_sbyte v0, v[0:1], off offset:-4095 -; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX906-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1919,11 +2008,11 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v3, 44 -; GFX906-NEXT: buffer_load_ushort v1, v3, s[0:3], s32 offen offset:4054 glc +; GFX906-NEXT: v_mov_b32_e32 v2, 44 +; GFX906-NEXT: buffer_load_ushort v1, v2, s[0:3], s32 offen offset:4054 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -1988,11 +2077,11 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v3, 44 -; GFX906-NEXT: buffer_load_sbyte v1, v3, s[0:3], s32 offen offset:4055 glc +; GFX906-NEXT: v_mov_b32_e32 v2, 44 +; GFX906-NEXT: buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2058,11 +2147,11 @@ ; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b ; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v3, 44 -; GFX906-NEXT: buffer_load_ubyte v1, v3, s[0:3], s32 offen offset:4055 glc +; GFX906-NEXT: v_mov_b32_e32 v2, 44 +; GFX906-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2131,9 +2220,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 44 ; GFX906-NEXT: buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -2203,9 +2291,8 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 44 ; GFX906-NEXT: buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX906-NEXT: s_mov_b32 s4, 0x7060100 +; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -169,8 +169,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -247,10 +247,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0 ; GFX9-NEXT: v_add_u32_e32 v0, 9, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -326,11 +326,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_movk_i32 s0, 0x1234 +; GFX9-NEXT: v_perm_b32 v0, v0, s0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -387,11 +388,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_movk_i32 s0, 0x4400 +; GFX9-NEXT: v_perm_b32 v0, v0, s0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -448,12 +450,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_movk_i32 s0, 0x1234 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, s0, v0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -510,12 +512,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_movk_i32 s0, 0x3c00 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, s0, v0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -572,11 +574,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, 64, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, 64, v0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -165,8 +165,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -241,10 +241,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v2, v1, s0 ; GFX9-NEXT: v_add_u32_e32 v0, 9, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -318,11 +318,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_movk_i32 s0, 0x7b +; GFX9-NEXT: v_perm_b32 v0, v0, s0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -378,10 +379,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 64 +; GFX9-NEXT: v_perm_b32 v0, v0, 64, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -437,12 +439,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_movk_i32 s0, 0x7b -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, s0, v0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -498,11 +500,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, 7, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, 7, v0, v1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v0 ; GFX9-NEXT: ;;#ASMEND @@ -557,3 +559,4 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } + diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll @@ -168,8 +168,9 @@ ; GFX9-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fadd_v4f16_fpexcept_strict: @@ -187,14 +188,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_add_f16_e32 v5, v1, v3 -; GFX10-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fadd_v4f16_fpexcept_strict: @@ -202,17 +201,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX11-NEXT: v_add_f16_e32 v2, v5, v4 -; GFX11-NEXT: v_add_f16_e32 v3, v7, v6 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_add_f16_e32 v2, v6, v5 +; GFX11-NEXT: v_add_f16_e32 v3, v7, v4 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -115,13 +115,12 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_fma_f16 v7, v9, v8, v7 ; GFX9-NEXT: v_fma_f16 v1, v1, v3, v5 ; GFX9-NEXT: v_fma_f16 v0, v0, v2, v4 -; GFX9-NEXT: v_fma_f16 v7, v9, v8, v7 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fma_v4f16_fpexcept_strict: @@ -154,13 +153,11 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; GFX10-NEXT: v_fmac_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_fmac_f16_e32 v6, v8, v7 +; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_fmac_f16_e32 v9, v11, v10 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v9, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v9, v4, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fma_v4f16_fpexcept_strict: @@ -174,13 +171,11 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; GFX11-NEXT: v_fmac_f16_e32 v4, v0, v2 -; GFX11-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX11-NEXT: v_fmac_f16_e32 v6, v8, v7 +; GFX11-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX11-NEXT: v_fmac_f16_e32 v9, v11, v10 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; GFX11-NEXT: v_lshl_or_b32 v0, v9, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, v6, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v9, v4, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -168,8 +168,9 @@ ; GFX9-NEXT: v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_mul_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fmul_v4f16_fpexcept_strict: @@ -187,14 +188,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_mul_f16_e32 v5, v1, v3 -; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fmul_v4f16_fpexcept_strict: @@ -202,17 +201,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX11-NEXT: v_mul_f16_e32 v2, v5, v4 -; GFX11-NEXT: v_mul_f16_e32 v3, v7, v6 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_mul_f16_e32 v2, v6, v5 +; GFX11-NEXT: v_mul_f16_e32 v3, v7, v4 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fmul.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -62,7 +62,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_strict: @@ -77,10 +78,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v2, v0, v1 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fsub_v2f16_fpexcept_strict: @@ -90,9 +90,8 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_sub_f16_e32 v1, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_sub_f16_e32 v2, v3, v2 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val @@ -104,7 +103,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: @@ -119,10 +119,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v2, v0, v1 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: @@ -132,9 +131,8 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_sub_f16_e32 v1, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_sub_f16_e32 v2, v3, v2 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x half> %val @@ -146,7 +144,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: @@ -161,10 +160,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v2, v0, v1 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: @@ -174,9 +172,8 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX11-NEXT: v_sub_f16_e32 v1, v3, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: v_sub_f16_e32 v2, v3, v2 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x half> %val @@ -188,7 +185,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -205,11 +203,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v2 ; GFX10-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fsub_v3f16_fpexcept_strict: @@ -221,8 +218,7 @@ ; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 ; GFX11-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX11-NEXT: v_sub_f16_e32 v2, v5, v4 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val @@ -237,8 +233,9 @@ ; GFX9-NEXT: v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_sub_f16_e32 v1, v1, v3 ; GFX9-NEXT: v_sub_f16_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_constained_fsub_v4f16_fpexcept_strict: @@ -256,14 +253,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_sub_f16_e32 v5, v1, v3 -; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX10-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_constained_fsub_v4f16_fpexcept_strict: @@ -271,17 +266,15 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX11-NEXT: v_sub_f16_e32 v2, v5, v4 -; GFX11-NEXT: v_sub_f16_e32 v3, v7, v6 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_sub_f16_e32 v2, v6, v5 +; GFX11-NEXT: v_sub_f16_e32 v3, v7, v4 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -720,18 +720,18 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0x5040302 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_pk_sub_i16 v3, v2, v3 +; GFX9-NEXT: v_perm_b32 v2, 0, v3, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -767,7 +767,6 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -776,10 +775,10 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: v_pk_sub_i16 v2, v1, v2 +; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX10-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v2, 0, v0, 0x5040302 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -166,8 +166,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_perm_b32 v0, s4, v5, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -179,7 +180,7 @@ ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX10-NEXT: v_perm_b32 v0, s4, v5, 0x5040302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -191,7 +192,7 @@ ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, s0, v0, 0x5040302 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -206,8 +207,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_perm_b32 v0, s4, v5, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -219,7 +221,7 @@ ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX10-NEXT: v_perm_b32 v0, s4, v5, 0x5040302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -231,7 +233,7 @@ ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, s0, v0, 0x5040302 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -246,12 +248,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -261,12 +260,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dword v4, v[2:3], off -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -276,13 +271,8 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: global_load_b32 v1, v[2:3], off -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -294,46 +284,38 @@ ; GFX9-LABEL: shuffle_v4f16_357u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_perm_b32 v1, s4, v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v4, v6, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_357u: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v1, s4, v5, 0x5040302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_357u: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: v_perm_b32 v1, s0, v3, 0x5040302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -909,10 +891,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 +; GFX9-NEXT: v_perm_b32 v1, v6, v5, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -923,10 +904,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v6, v5, 0x5040302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -938,10 +917,7 @@ ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040302 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -956,10 +932,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v6, v5, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -970,12 +945,10 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v0, v6, v5, 0x5040302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_5623: @@ -985,10 +958,7 @@ ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x5040302 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -1001,48 +971,37 @@ ; GFX9-LABEL: shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_perm_b32 v1, v5, v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2 +; GFX9-NEXT: v_perm_b32 v0, v4, v6, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_3456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v1, v5, v4, 0x5040302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x5040302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_3456: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 -; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1054,15 +1013,13 @@ ; GFX9-LABEL: shuffle_v4f16_5634: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX9-NEXT: v_perm_b32 v1, v4, v6, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_5634: @@ -1071,13 +1028,10 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v2 +; GFX10-NEXT: v_perm_b32 v1, v4, v6, 0x5040302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_5634: @@ -1085,17 +1039,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off -; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x5040302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v4 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1107,16 +1055,14 @@ ; GFX9-LABEL: shuffle_v4f16_5734: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_mov_b32 s5, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v1, v4, v6, s5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_5734: @@ -1125,14 +1071,10 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v1, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v1, v4, v6, 0x5040302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_5734: @@ -1140,18 +1082,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off -; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v4 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1165,10 +1100,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0 +; GFX9-NEXT: v_perm_b32 v1, v6, v5, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1179,10 +1113,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, v6, v5, 0x5040302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1248,9 +1180,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1260,8 +1192,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1271,9 +1202,8 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -1287,10 +1217,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1300,9 +1229,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040302 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1312,10 +1239,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040302 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1329,13 +1253,12 @@ ; GFX9-LABEL: shuffle_v4f16_1100: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_mov_b32 s5, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v1, s5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_1100: @@ -1344,11 +1267,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_1100: @@ -1357,13 +1277,8 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 +; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1375,13 +1290,11 @@ ; GFX9-LABEL: shuffle_v4f16_6161: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1389,13 +1302,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 -; GFX10-NEXT: global_load_dword v5, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x7060100 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1403,14 +1313,11 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060100 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -1424,10 +1331,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_2333: @@ -1436,9 +1342,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_2333: @@ -1447,10 +1351,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1463,10 +1364,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_6667: @@ -1475,9 +1375,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_6667: @@ -1486,10 +1384,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1642,10 +1537,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 ; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 +; GFX9-NEXT: v_perm_b32 v0, v6, v5, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1656,12 +1550,10 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 ; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_perm_b32 v0, v6, v5, 0x5040302 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v8f16_13_14_2_3: @@ -1671,10 +1563,7 @@ ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8 ; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: v_perm_b32 v0, v3, v2, 0x5040302 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 @@ -1688,9 +1577,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_perm_b32 v1, v1, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v3f16_0122: @@ -1699,8 +1588,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v3f16_0122: @@ -1709,9 +1597,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX11-NEXT: v_perm_b32 v1, v1, v1, 0x5040100 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0 %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1 @@ -1724,10 +1610,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x5040302 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 +; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v2f16_0122: @@ -1735,10 +1620,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v1, v0, 16, v1 +; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x5040302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v2f16_0122: @@ -1747,10 +1630,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x5040302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0 %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1 @@ -1907,14 +1787,13 @@ ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: s_mov_b32 s5, 0x5040302 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v2 +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v5, s5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_0456: @@ -1925,29 +1804,22 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off ; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v6, v5, 0x5040302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_0456: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040302 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -2006,3 +1878,309 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone speculatable } +define <2 x half> @low16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) { +; GFX9-LABEL: low16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: low16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: low16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4 + %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> + ret <2 x half> %vy1.2.vec.insert +} + +define <2 x half> @hi16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) { +; GFX9-LABEL: hi16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: hi16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: hi16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4 + %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> + ret <2 x half> %vy1.2.vec.insert +} + +define <2 x half> @low16hi16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) { +; GFX9-LABEL: low16hi16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: low16hi16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060100 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: low16hi16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060100 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4 + %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> + ret <2 x half> %vy1.2.vec.insert +} + +define <2 x half> @hi16low16bits(<2 x half> addrspace(1)* %x0, <2 x half> addrspace(1)* %x1) { +; GFX9-LABEL: hi16low16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: hi16low16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040302 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: hi16low16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040302 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x half>, <2 x half> addrspace(1)* %x0, align 4 + %1 = load <2 x half>, <2 x half> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> + ret <2 x half> %vy1.2.vec.insert +} + +define <2 x i16> @i16_low16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) { +; GFX9-LABEL: i16_low16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_low16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_low16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4 + %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> + ret <2 x i16> %vy1.2.vec.insert +} + +define <2 x i16> @i16_low16hi16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) { +; GFX9-LABEL: i16_low16hi16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_low16hi16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060100 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_low16hi16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060100 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4 + %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> + ret <2 x i16> %vy1.2.vec.insert +} + +define <2 x i16> @i16_hi16low16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) { +; GFX9-LABEL: i16_hi16low16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_hi16low16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040302 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_hi16low16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4 + %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> + ret <2 x i16> %vy1.2.vec.insert +} + +define <2 x i16> @i16_hi16bits(<2 x i16> addrspace(1)* %x0, <2 x i16> addrspace(1)* %x1) { +; GFX9-LABEL: i16_hi16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: i16_hi16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_hi16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %x0, align 4 + %1 = load <2 x i16>, <2 x i16> addrspace(1)* %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> + %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> + ret <2 x i16> %vy1.2.vec.insert +}